# 对数据进行处理

In [1]:
from datasets import load_dataset

dataset = load_dataset("eli5_category", split="train[:5000]")
dataset = dataset.train_test_split(0.3).flatten()
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 3500
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 1500
    })
})

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("your model path")
tokenizer

RobertaTokenizerFast(name_or_path='D:/Desktop/learn/instance/model/DistilRoBERTa', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
)

In [3]:
def preprocess_function(examples):
    # examples["answers_text"]中数据存在[[]]样子
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

In [4]:
tokenizede_data = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenizede_data

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 3500/3500 [00:01<00:00, 2369.91 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 2395.65 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3500
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1500
    })
})

In [5]:
block_size = 128

def group_text(examples):
    
    concatenated_examples = {k:sum(examples[k],[]) for k in examples.keys()}

    total_length = len(concatenated_examples[list(examples.keys())[0]])

    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    
    result = {
        k:[v[i:i+block_size] for i in range(0, total_length, block_size)]
        for k, v in concatenated_examples.items()
    }
    return result

In [6]:
grouped_dataset = tokenizede_data.map(group_text, batched=True)
grouped_dataset

Map: 100%|██████████| 3500/3500 [00:11<00:00, 298.27 examples/s]
Map: 100%|██████████| 1500/1500 [00:04<00:00, 334.99 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 9285
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3877
    })
})

In [7]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
# 文本中被替换成为[MASK]的比例
# 被MASK的位置填充成为原来的ids，没有被MASK的被填为-100
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm_probability=0.15)
tokenizer

RobertaTokenizerFast(name_or_path='D:/Desktop/learn/instance/model/DistilRoBERTa', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '</s>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
)

# 训练模型

In [7]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

model = AutoModelForMaskedLM.from_pretrained("your model path")
model

Some weights of the model checkpoint at D:/Desktop/learn/instance/model/DistilRoBERTa were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm)

In [9]:
training_args = TrainingArguments(
    output_dir="./checkpoint",
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=3,
    logging_strategy="steps",
    logging_steps=20,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    eval_steps=20,
    save_strategy="epoch",
    save_total_limit=2,
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=grouped_dataset["train"],
    eval_dataset=grouped_dataset["test"],
    data_collator=data_collator
)

In [11]:
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

# 推断

## 使用pipeline进行推断

In [2]:
from transformers import pipeline

pipe = pipeline(task="fill-mask",model="D:/Desktop/learn/instance/model/DistilRoBERTa")

text = "The Milky Way is a <mask> galaxy."

pipe(text, top_k=3)

Some weights of the model checkpoint at D:/Desktop/learn/instance/model/DistilRoBERTa were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'score': 0.6413401365280151,
  'token': 21300,
  'token_str': ' spiral',
  'sequence': 'The Milky Way is a spiral galaxy.'},
 {'score': 0.15504248440265656,
  'token': 30794,
  'token_str': ' dwarf',
  'sequence': 'The Milky Way is a dwarf galaxy.'},
 {'score': 0.03614896163344383,
  'token': 2232,
  'token_str': ' massive',
  'sequence': 'The Milky Way is a massive galaxy.'}]

## 手动进行推断

In [9]:
import torch

text = "The Milky Way is a <mask> galaxy."
inputs = tokenizer(text, return_tensors="pt")
# 返回的是一个元组，第一个元素代表的是批量索引，第二个元素代表位置
mask_token_idx = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_idx, inputs


(tensor([6]),
 {'input_ids': tensor([[    0,   133, 36713,  4846,    16,    10, 50264, 22703,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])})

In [23]:
outputs = model(**inputs)
logits = outputs.logits[0]
# logits.shape => batch_size, sequence_length, vocab_length
mask_token_logits = logits[mask_token_idx]

top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))
top_3_tokens

The Milky Way is a  spiral galaxy.
The Milky Way is a  dwarf galaxy.
The Milky Way is a  massive galaxy.


[21300, 30794, 2232]