In [2]:
# 导入的库
from datasets import load_dataset
from pprint import pprint

from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# Load the training split of the fine-tuning dataset and display its summary
examples = load_dataset(
    "lamini/lamini_docs",
    split="train",
)
examples

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})

In [4]:
# Structured text template for fine-tuning; a structured template is generally recommended.
## Use the structured template to build the fine-tuning records

prompt_template="""### Question:
{question}

### Answer:"""

# Read each column once. Indexing `examples["question"][i]` inside the loop
# repeats the column lookup on every iteration.
questions = examples["question"]
answers = examples["answer"]
num_examples = len(questions)

# One record per example: the question wrapped in the template, plus the
# answer unchanged.
finetuning_dataset = []
for question, answer in zip(questions, answers):
    text_with_prompt_template = prompt_template.format(question=question)
    finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})
finetuning_dataset

[{'question': '### Question:\nHow can I evaluate the performance and quality of the generated text from Lamini models?\n\n### Answer:',
  'answer': "There are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance."},
 {'question': "### Question:\nCan I find information about the code's approach to handling long-running tasks and background jobs?\n\n### Answer:",
  'answer': 'Yes, the code includes methods for submitting jobs, checking job status, and retrie

In [5]:
# Pretty-print one fine-tuning record after the template has been applied
first_record = finetuning_dataset[0]
pprint(first_record)

{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'question': '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '\n'
             '### Answer:'}


In [6]:
# Tokenizer: load the pretrained one for "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(
    "EleutherAI/pythia-70m",
)



In [7]:
# Tokenization function
def tokenize_function(examples, max_length=2048):
    """Tokenize the first question/answer pair of a batch as a single sequence.

    Args:
        examples: Mapping with "question" and "answer" entries, each an
            indexable sequence of strings. Only element 0 of each is read.
        max_length: Maximum number of tokens to keep. Defaults to 2048, the
            limit that was previously hard-coded.

    Returns:
        The result of calling the module-level ``tokenizer`` on the
        concatenated text with ``return_tensors='np'`` and truncation enabled.

    Side effects:
        Sets ``tokenizer.pad_token`` to ``tokenizer.eos_token`` and
        ``tokenizer.truncation_side`` to ``'left'`` on the shared tokenizer.
    """
    # Concatenate the question and the answer into one training text.
    text = examples["question"][0] + examples["answer"][0]

    # Use the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    # When the text is too long, drop tokens from the left and keep the end.
    tokenizer.truncation_side = 'left'

    # A single pass is enough: truncating to min(token_count, max_length)
    # keeps exactly the same tokens as truncating to max_length, so the extra
    # length-measuring tokenization is not needed.
    return tokenizer(text, return_tensors='np', truncation=True, max_length=max_length)

In [8]:
# Tokenize the Dataset returned by load_dataset directly, without applying
# the structured template. Rows are processed in batches of one.
tokenized_dataset = examples.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True,
)
tokenized_dataset

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})

In [9]:
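# Add a `labels` column to the dataset so Hugging Face can consume it (left disabled: the tokenized dataset shown above already lists a `labels` feature)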
# dataset = tokenized_dataset.add_column("labels",tokenized_dataset["input_ids"])
# dataset

In [10]:
# Hold out 10% of the tokenized rows as a test split; the fixed seed keeps
# the shuffle reproducible.
split_dataset = tokenized_dataset.train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=123,
)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1134
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
})


In [11]:
# The processed data is essentially the same data that can be downloaded
# from Hugging Face.
# NOTE: this rebinds `examples`, which earlier in the notebook was loaded
# with split="train"; here no split is requested.
examples = load_dataset(
    "lamini/lamini_docs",
)
print(examples)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


In [12]:
# Some other interesting datasets

taylor_swift_dataset = "lamini/taylor_swift"
bts_dataset = "lamini/bts"
open_llms = "lamini/open_llms"

# Load the Taylor Swift dataset and print the record at index 1 of its
# "train" split.
dataset_taylor_swift = load_dataset(taylor_swift_dataset)
taylor_swift_train = dataset_taylor_swift["train"]
print(taylor_swift_train[1])

Downloading readme: 100%|██████████| 573/573 [00:00<00:00, 1.96kB/s]
Downloading data: 100%|██████████| 257k/257k [00:01<00:00, 147kB/s]
Downloading data: 100%|██████████| 46.3k/46.3k [00:01<00:00, 39.2kB/s]
Generating train split: 100%|██████████| 783/783 [00:00<00:00, 146417.30 examples/s]
Generating test split: 100%|██████████| 87/87 [00:00<00:00, 24421.39 examples/s]

{'question': 'What is the most popular Taylor Swift song among millennials? How does this song relate to the millennial generation? What is the significance of this song in the millennial culture?', 'answer': 'Taylor Swift\'s "Shake It Off" is the most popular song among millennials. This song relates to the millennial generation as it is an anthem of self-acceptance and embracing one\'s individuality. The song\'s message of not letting others bring you down and to just dance it off resonates with the millennial culture, which is often characterized by a strong sense of individuality and a rejection of societal norms. Additionally, the song\'s upbeat and catchy melody makes it a perfect fit for the millennial generation, which is known for its love of pop music.', 'input_ids': [1276, 310, 253, 954, 4633, 11276, 24619, 4498, 2190, 24933, 8075, 32, 1359, 1057, 436, 4498, 14588, 281, 253, 24933, 451, 5978, 32, 1737, 310, 253, 8453, 273, 436, 4498, 275, 253, 24933, 451, 4466, 32, 37979, 24


