# 进行数据处理 => 就是转化成为model可以直接使用的tensor

1. 加载tokenizer
2. 编写 preprocess_function 函数（这里只做分词和截断，不做填充；填充交给 transformers 内置的 data collator 在组 batch 时完成）
3. 对 DatasetDict 调用 map 进行映射：映射函数的第一个参数会接收 Dataset 中的记录（默认每次一条，设置 batched=True 时为一批）（注意：DatasetDict 可以直接进行映射，不需要先拆分成 train 和 test）
4. 根据任务的不同对数据设置不同的数据处理器

In [1]:
from datasets import load_dataset

# Load the dataset; "dataset path" is a placeholder to be replaced with the real location.
data = load_dataset(r"dataset path")
# Display the loaded object to inspect its splits, columns and row counts.
data

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [2]:
from transformers import AutoTokenizer
# Load the tokenizer from the model directory ("your model path" is a placeholder).
tokenizer = AutoTokenizer.from_pretrained("your model path")

  tokenizer = AutoTokenizer.from_pretrained("D:\Desktop\learn\instance\model")


In [3]:
def preprocess_function(examples):
    """Tokenize the ``text`` field and carry the ``label`` field over as ``labels``.

    Args:
        examples: Mapping with a ``"text"`` entry (passed to the tokenizer)
            and a ``"label"`` entry (copied through unchanged).

    Returns:
        dict with the tokenizer's ``input_ids`` and ``attention_mask`` plus
        ``labels``. Truncation is enabled; no padding is requested here.
    """
    encoded = tokenizer(examples["text"], truncation=True)
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": examples["label"],
    }
# Tokenize every split of the DatasetDict in one call.
# batched=True passes a batch of records (column name -> list of values) to
# preprocess_function on each call instead of a single record; the datasets
# documentation recommends this for tokenization because it is much faster.
# The original columns are removed so only the fields returned by
# preprocess_function remain.
tokenized_data = data.map(
    preprocess_function,
    batched=True,
    remove_columns=data["train"].column_names,
)
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25000
    })
})

In [4]:
# Inspect the first tokenized record of the test split to check the produced fields.
tokenized_data["test"][0]

{'input_ids': [101,
  1045,
  2293,
  16596,
  1011,
  10882,
  1998,
  2572,
  5627,
  2000,
  2404,
  2039,
  2007,
  1037,
  2843,
  1012,
  16596,
  1011,
  10882,
  5691,
  1013,
  2694,
  2024,
  2788,
  2104,
  11263,
  25848,
  1010,
  2104,
  1011,
  12315,
  1998,
  28947,
  1012,
  1045,
  2699,
  2000,
  2066,
  2023,
  1010,
  1045,
  2428,
  2106,
  1010,
  2021,
  2009,
  2003,
  2000,
  2204,
  2694,
  16596,
  1011,
  10882,
  2004,
  17690,
  1019,
  2003,
  2000,
  2732,
  10313,
  1006,
  1996,
  2434,
  1007,
  1012,
  10021,
  4013,
  3367,
  20086,
  2015,
  1010,
  10036,
  19747,
  4520,
  1010,
  25931,
  3064,
  22580,
  1010,
  1039,
  2290,
  2008,
  2987,
  1005,
  1056,
  2674,
  1996,
  4281,
  1010,
  1998,
  16267,
  2028,
  1011,
  8789,
  3494,
  3685,
  2022,
  9462,
  2007,
  1037,
  1005,
  16596,
  1011,
  10882,
  1005,
  4292,
  1012,
  1006,
  1045,
  1005,
  1049,
  2469,
  2045,
  2024,
  2216,
  1997,
  2017,
  2041,
  2045,
  2040,
  2228,

In [5]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch with the tokenizer's padding
# settings and returns PyTorch tensors ("pt"); it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# 写评估函数

1. 根据任务选取不同的评估指标（从Task中查找）
2. 使用evaluate设置评估器
3. 编写评估函数 compute_metrics（需要返回评估器计算出的结果）【Trainer 会向它传入一个参数 eval_pred，从中可以取出模型的预测输出以及 labels】

In [6]:
import evaluate
# Load the accuracy metric from a local metric script (relative path, so it
# depends on the notebook's working directory).
accuracy = evaluate.load("../metric_accuracy.py")
# Display the loaded evaluation module (shows its expected inputs and usage).
accuracy

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

In [7]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the accuracy metric.

    Args:
        eval_pred: Pair ``(model_outputs, labels)``; the predicted class of
            each row is the index of its largest value along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)
    

# 模型训练
1. 使用 AutoModelFor...（本例为 AutoModelForSequenceClassification）加载预训练模型
2. 对于分类任务而言，需要传入label2id和id2label
3. 定义TrainingArguments
4. 定义Trainer并且进行训练

In [8]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Lookup tables between class indices and class names. The inverse table is
# derived from the forward one so the two cannot drift apart.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}

In [9]:
# Load the pretrained checkpoint with a two-class sequence-classification head
# and attach the label maps ("your model path" is a placeholder).
model = AutoModelForSequenceClassification.from_pretrained(r"your model path", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at D:\Desktop\learn\instance\model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Settings for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Checkpointing: write under ./checkpoint once per epoch, limit of 3 kept.
    output_dir="./checkpoint",
    save_strategy="epoch",
    save_total_limit=3,
    # Optimization.
    num_train_epochs=3,
    learning_rate=3e-5,
    warmup_steps=50,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    # Evaluation and logging, both every 100 steps.
    eval_strategy="steps",
    eval_steps=100,
    per_device_eval_batch_size=4,
    logging_strategy="steps",
    logging_steps=100,
    # NOTE(review): load_best_model_at_end is left disabled. Enabling it
    # presumably requires save_strategy to match eval_strategy - confirm
    # against the TrainingArguments documentation before turning it on.
)

In [11]:
# Train and evaluate on small random subsets to keep the run short.
# A fixed seed is passed to shuffle() so the same examples are drawn on every
# run; without it the subsets (and therefore the reported metrics) change each
# time the cell is executed.
SUBSET_SEED = 42
train_subset = tokenized_data["train"].shuffle(seed=SUBSET_SEED).select(range(1200))
eval_subset = tokenized_data["test"].shuffle(seed=SUBSET_SEED).select(range(100))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    data_collator=data_collator,      # batches and pads the tokenized examples
    compute_metrics=compute_metrics,  # accuracy reported at each evaluation
)

In [12]:
# Run fine-tuning; the cell output is the per-evaluation metrics table followed
# by the returned TrainOutput summary.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
100,0.6732,0.612326,0.66
200,0.4587,0.610188,0.79
300,0.5367,0.445697,0.84
400,0.2387,0.606936,0.86
500,0.3526,0.50686,0.85
600,0.2193,0.651298,0.86
700,0.091,0.639959,0.86
800,0.0936,0.604286,0.88
900,0.0707,0.631831,0.88


TrainOutput(global_step=900, training_loss=0.30383392598893905, metrics={'train_runtime': 343.1919, 'train_samples_per_second': 10.49, 'train_steps_per_second': 2.622, 'total_flos': 393013178485968.0, 'train_loss': 0.30383392598893905, 'epoch': 3.0})

In [36]:
# Display the model's module tree (layers and their dimensions).
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
