# 对话机器人

In [2]:
from datasets import Dataset

from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

In [3]:
dataset = Dataset.load_from_disk("datas/alpaca_data_zh")

In [4]:
dataset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 26858
})

In [5]:
dataset[10]

{'output': '水的沸点是指水在大气压强下由液态变成气态的温度。在标准大气压（101.325 kPa）下，水的沸点约为 100°C （摄氏度） 或 212°F（华氏度）。 需要注意的是，水的沸点会随着海拔的改变而改变，例如，在高海拔地区，空气压强降低，水的沸点也会降低。此外，水中溶解物质的多少也会影响沸点，如盐水的沸点要高于纯水的沸点。',
 'input': '',
 'instruction': '查水的沸点。'}

In [6]:
tokenizer = AutoTokenizer.from_pretrained("/root/autodl-tmp/models/bloom-389")

In [7]:
def process_func(example):
    MAX_LENGTH = 512
    instruction = "\n".join(
        ["Human: ", example["instruction"], example["input"]]).strip() + "\n\nAssistant: "
    inputs = tokenizer(instruction)
    response = tokenizer(example["output"])
    input_ids = inputs["input_ids"] + response["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = inputs["attention_mask"] + response["attention_mask"] + [1]
    labels = [-100] * len(inputs["input_ids"]) + response["input_ids"] + [tokenizer.eos_token_id]

    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [8]:
tokenized_data = dataset.map(process_func, remove_columns=dataset.column_names)

Map:   0%|          | 0/26858 [00:00<?, ? examples/s]

In [9]:
print(tokenized_data[0])

{'input_ids': [23069, 29, 2705, 6583, 24772, 8995, 13533, 671, 189, 4122, 15263, 29, 210, 4744, 583, 6583, 24772, 8995, 13533, 1022, 189, 189, 20, 17, 210, 6583, 8416, 3228, 420, 8634, 1900, 13648, 8416, 5625, 355, 1202, 29011, 553, 30355, 1298, 15599, 355, 961, 4872, 34650, 5980, 355, 10915, 15342, 7761, 355, 1403, 11472, 6189, 20465, 671, 189, 21, 17, 210, 20122, 13660, 420, 8634, 13869, 20189, 373, 17070, 553, 16382, 553, 1204, 6165, 1430, 641, 14562, 16130, 24251, 15502, 7984, 355, 7981, 1220, 6538, 553, 1220, 14562, 641, 13545, 10249, 355, 714, 6583, 24772, 13660, 11297, 671, 189, 22, 17, 210, 17672, 16272, 420, 17672, 1063, 13966, 5980, 18688, 355, 30645, 8634, 1638, 7900, 954, 3779, 210, 38858, 17672, 420, 14054, 17672, 11472, 15375, 10891, 355, 4872, 8416, 7442, 355, 1403, 5323, 4001, 16885, 14721, 1249, 420, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [11]:
model = AutoModelForCausalLM.from_pretrained("/root/autodl-tmp/models/bloom-389")

In [12]:
args = TrainingArguments(
    output_dir="trained/model_for_chatbot",
    per_device_train_batch_size=4,
    logging_steps=100,
    num_train_epochs=1,
    gradient_accumulation_steps=8
)

In [13]:
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    train_dataset=tokenized_data
)

  trainer = Trainer(


In [21]:
trainer.train()

Step,Training Loss
100,1.7914
200,1.5239
300,1.5844
400,1.6748
500,1.7759
600,1.8724
700,2.0536
800,2.1929


TrainOutput(global_step=840, training_loss=1.8176349730718704, metrics={'train_runtime': 374.4953, 'train_samples_per_second': 71.718, 'train_steps_per_second': 2.243, 'total_flos': 5844548961927168.0, 'train_loss': 1.8176349730718704, 'epoch': 1.0})

In [22]:
from transformers import pipeline

In [23]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [25]:
pipe("Human: \n" + "考试有哪些技巧?\n" + "\n\nAssistant: ", 
    max_length=256, 
    do_sample=True)

Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Human: \n考试有哪些技巧?\n\n\nAssistant: 考试技巧有很多，下面给出一些关于考试技巧的常见做法：\n\n1. 预习：预习可以帮助你熟悉考试内容，提高理解能力。\n\n2. 复习：复习可以帮助你巩固所学知识，减轻压力，提高应考能力。\n\n3. 备考：备考可以帮助你提前准备考试，保持头脑清醒，更好地应对考题。\n\n4. 分析:分析：分析可以帮助你更好地理解考试内容，提高应考能力。\n\n5. 准备题：准备题可以帮助你巩固所学知识，提高应考能力。'}]