## 第九章作业

#### 根据硬件资源情况，在 AdvertiseGen 数据集上使用 QLoRA 微调 ChatGLM3-6B 至少 10K examples，观察 Loss 变化情况，并对比微调前后模型输出结果。

In [1]:
# Notebook-wide configuration. Every tunable value lives here.

# Model identifier; also reused as the sub-path of the output directory (models/<model_name>).
model_name = 'THUDM/chatglm3-6b'
# Local directory the tokenizer and the quantized base model are loaded from.
model_local_dir = '/root/huggingface/hub/chatglm3-6b'
# Dataset identifier handed to datasets.load_dataset.
train_data_path = 'HasturOfficial/adgen'
# NOTE(review): not referenced by any cell visible in this notebook.
eval_data_path = None
# Seed used when shuffling the tokenized dataset before sub-sampling.
seed = 8
# Token budgets enforced in tokenize_func (prompt side / answer side).
max_input_length = 512
max_output_length = 1536
# LoRA hyperparameters forwarded to LoraConfig.
lora_rank = 4
lora_alpha = 32
lora_dropout = 0.05
# NOTE(review): not referenced by any cell visible in this notebook.
resume_from_checkpoint = None
# Text prepended to every example's 'content' in tokenize_func (empty string: no prefix).
prompt_text = ''
# NOTE(review): currently unused; the BitsAndBytesConfig cell hard-codes 'bf16'
# instead of looking this value up in _compute_dtype_map.
compute_dtype = 'fp32'

In [2]:
from datasets import load_dataset

# Load the dataset named by train_data_path; the bare expression on the last
# line displays its splits, columns and row counts.
dataset = load_dataset(train_data_path)
dataset

DatasetDict({
    train: Dataset({
        features: ['content', 'summary'],
        num_rows: 114599
    })
    validation: Dataset({
        features: ['content', 'summary'],
        num_rows: 1070
    })
})

In [3]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Columns whose feature type is a ClassLabel (or a Sequence of ClassLabel)
    are shown with their label names instead of integer ids.

    Args:
        dataset: A dataset supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If more rows are requested than the dataset contains.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement in a single call, replacing the
    # retry-until-unique loop whose `pick in picks` scan degrades badly when
    # num_examples approaches len(dataset).
    picks = random.sample(range(len(dataset)), num_examples)
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Map integer class ids to their human-readable names.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # Same mapping, applied element-wise to each list of class ids.
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [4]:
# Preview three random rows of the raw training split.
show_random_elements(dataset['train'], num_examples=3)

Unnamed: 0,content,summary
0,类型#裙*版型#显瘦*风格#性感*图案#圆点*图案#印花*裙领型#v领*裙领型#polo领*裙领型#翻领,精致的polo领，简洁大方，干练利落。圆点印花的设计，个性时尚。清纯中又不失性感的味道，一秒钟就可以让你变成仙女，简单又大气，可以满足绝大多数人的欣赏水平。修身显瘦的版型，十分迷人，衣身的版型很好，舒适百搭。经典时尚的v领子衬托女性的完美脸型，经典的翻领严肃设计不会显得太严肃，还可以微微露出一点性感的锁骨，若隐若现。
1,类型#裙*图案#字母*图案#文字*裙型#背带裙,甜美减龄的背带裙，加上裙子边上的字母和趣味图案设计，让这款背带裙更加的有设计感和趣味感，穿着更加的个性，吸睛，出门分分钟吸引大片男神目光！
2,类型#裙*风格#清新*图案#刺绣*裙长#连衣裙*裙款式#口袋,一款别致清新的连衣裙，采用了合理的剪裁勾勒出完美的身体曲线，尤其是拉高的腰线设计，能很好地拉长身材比例；配合超大的飞边衣袖，让宝贝活动不受束缚。用刺绣的工艺勾勒的裙摆和口袋，实用更完美是品牌价值的体现。


In [5]:
from transformers import AutoTokenizer

# Load the tokenizer from the local model directory. trust_remote_code=True is
# passed so the tokenizer implementation shipped with the checkpoint can be used.
tokenizer = AutoTokenizer.from_pretrained(
    model_local_dir,
    trust_remote_code=True,
    # Optional revision pins, left disabled:
    # revision='b098244',
    # revision='a8079d5',
)

Setting eos_token is not supported, use the default one.
Setting pad_token is not supported, use the default one.
Setting unk_token is not supported, use the default one.


In [6]:
# Display the tokenizer's repr (vocabulary size, padding side, special tokens).
tokenizer

ChatGLMTokenizer(name_or_path='/root/huggingface/hub/chatglm3-6b', vocab_size=64798, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	64790: AddedToken("[gMASK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	64792: AddedToken("sop", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	64795: AddedToken("<|user|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	64796: AddedToken("<|assistant|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
}

In [6]:
def tokenize_func(example, tokenizer, ignore_label_id=-100):
    """Turn one raw example into `input_ids` and loss-masked `labels`.

    The prompt is the module-level `prompt_text` followed by
    `example['content']`, plus `example['input']` on a new line when that field
    is present and not blank. The answer is `example['summary']`.

    Args:
        example: Mapping with at least 'content' and 'summary'; 'input' is optional.
        tokenizer: Tokenizer providing `encode` and `build_inputs_with_special_tokens`.
        ignore_label_id: Label value written over the prompt positions so they
            are excluded from the loss.

    Returns:
        Dict with 'input_ids' (full token sequence) and 'labels' (the same
        sequence with the prompt part replaced by `ignore_label_id`).
    """
    prompt = prompt_text + example['content']
    extra_input = example.get('input', None)
    if extra_input and extra_input.strip():
        prompt = prompt + f'\n{extra_input}'

    target = example['summary']

    prompt_ids = tokenizer.encode(text=prompt, add_special_tokens=False)
    target_ids = tokenizer.encode(text=target, add_special_tokens=False)

    # Budgets: the prompt gives up two positions and the answer one. The two
    # prompt positions match the two leading special tokens visible in the
    # sample output ([gMASK], sop); the answer slot presumably holds a trailing
    # end-of-sequence token -- confirm against the tokenizer implementation.
    # Slicing is a no-op when the sequence is already within budget.
    prompt_ids = prompt_ids[:max_input_length - 2]
    target_ids = target_ids[:max_output_length - 1]

    input_ids = tokenizer.build_inputs_with_special_tokens(prompt_ids, target_ids)

    # Everything up to and including the prompt is masked out of the loss.
    masked_prefix_len = len(prompt_ids) + 2
    labels = [ignore_label_id] * masked_prefix_len + input_ids[masked_prefix_len:]
    return {'input_ids': input_ids, 'labels': labels}

In [7]:
# Tokenize the training split one example at a time (batched=False) and drop
# the original columns so only tokenize_func's outputs remain.
column_names = dataset['train'].column_names
tokenized_dataset = dataset['train'].map(
    lambda example: tokenize_func(example, tokenizer),
    batched=False,
    remove_columns=column_names
)


In [32]:
# Sanity check: show one tokenized row to verify that the prompt positions in
# 'labels' are masked with -100.
show_random_elements(tokenized_dataset, num_examples=1)

Unnamed: 0,input_ids,labels
0,"[64790, 64792, 30910, 33467, 31010, 56532, 30998, 38317, 31010, 38683, 54901, 30998, 37505, 31010, 41260, 30998, 37505, 31010, 32502, 30998, 37505, 31010, 33242, 30998, 37505, 31010, 45859, 30998, 56532, 54888, 31010, 54839, 57449, 56532, 30998, 56532, 40877, 31010, 42875, 30998, 56532, 56278, 54888, 31010, 55426, 55316, 56278, 30910, 34746, 54839, 57449, 55090, 32859, 43385, 31123, 56532, 56158, 54807, 33284, 56142, 55251, 32144, 34481, 31123, 32985, 33242, 54530, 41260, 45859, 31123, 33222, 54536, 54888, 31123, 35765, 34372, 33481, 31155, 55673, 54815, 55251, 52667, 56142, 55474, 31735, 31123, 33612, 41256, 31123, 35021, 31735, 54706, 31123, 55432, 54557, 54550, 32019, 32375, 32799, 31155, ...]","[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 30910, 34746, 54839, 57449, 55090, 32859, 43385, 31123, 56532, 56158, 54807, 33284, 56142, 55251, 32144, 34481, 31123, 32985, 33242, 54530, 41260, 45859, 31123, 33222, 54536, 54888, 31123, 35765, 34372, 33481, 31155, 55673, 54815, 55251, 52667, 56142, 55474, 31735, 31123, 33612, 41256, 31123, 35021, 31735, 54706, 31123, 55432, 54557, 54550, 32019, 32375, 32799, 31155, ...]"


In [10]:
# Shuffle with the configured seed and keep the first 12000 examples (the
# assignment asks for at least 10K). flatten_indices() then rewrites the
# selection as a contiguous dataset instead of an index view over the original.
tokenized_dataset = tokenized_dataset.shuffle(seed=seed).select(range(12000))
tokenized_dataset = tokenized_dataset.flatten_indices()

Flattening the indices:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [11]:
import torch
from typing import List, Dict, Optional

class DataCollatorForChatGLM:
    """Collate tokenized examples into right-padded `input_ids` / `labels` tensors.

    Every example is padded to the length of the longest example in the batch:
    `input_ids` with `pad_token_id`, `labels` with `ignore_label_id`. When the
    longest example exceeds `max_length`, every row is cut to `max_length`.
    Rows are emitted longest-first.
    """

    def __init__(self, pad_token_id: int, max_length: int = 2048, ignore_label_id: int = -100):
        """Store the padding id, the hard length cap and the label fill value."""
        self.pad_token_id = pad_token_id
        self.max_length = max_length
        self.ignore_label_id = ignore_label_id

    def __call__(self, batch_data: List[Dict[str, List]]) -> Dict[str, torch.Tensor]:
        """Build the batch.

        Args:
            batch_data: Examples, each a dict with 'input_ids' and 'labels' lists.

        Returns:
            Dict with 'input_ids' and 'labels', each a stacked LongTensor.
        """
        longest = max(len(item['input_ids']) for item in batch_data)
        overflow = longest > self.max_length

        def fit(tokens, filler, missing):
            # Right-pad to the batch width, then cut only if the batch overflows.
            row = tokens + [filler] * missing
            return row[:self.max_length] if overflow else row

        # sorted() is stable, so equally long examples keep their incoming order.
        ordered = sorted(batch_data, key=lambda item: -len(item['input_ids']))

        id_rows, label_rows = [], []
        for item in ordered:
            # The pad amount is derived from input_ids and reused for labels.
            missing = longest - len(item['input_ids'])
            id_rows.append(torch.LongTensor(fit(item['input_ids'], self.pad_token_id, missing)))
            label_rows.append(torch.LongTensor(fit(item['labels'], self.ignore_label_id, missing)))

        return {'input_ids': torch.stack(id_rows), 'labels': torch.stack(label_rows)}

In [12]:
# Pad with the tokenizer's pad token; max_length and ignore_label_id keep their
# defaults (2048 and -100).
data_collator = DataCollatorForChatGLM(pad_token_id=tokenizer.pad_token_id)

In [14]:
from transformers import AutoModel, BitsAndBytesConfig

# Maps the short dtype names used in the config cell to torch dtypes.
_compute_dtype_map = {
    'fp32': torch.float32,
    'fp16': torch.float16,
    'bf16': torch.bfloat16,
}

# 4-bit NF4 quantization with double quantization (the QLoRA setup).
# NOTE(review): the compute dtype is hard-coded to 'bf16' here, so the
# `compute_dtype = 'fp32'` setting in the config cell has no effect. Confirm
# which of the two is intended before wiring the config value in.
q_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=_compute_dtype_map['bf16'],
)

In [15]:
# Load the base model from the local directory, quantized according to q_config.
# device_map='auto' leaves device placement to the library.
model = AutoModel.from_pretrained(
    model_local_dir,
    quantization_config=q_config,
    device_map='auto',
    trust_remote_code=True,
    # Optional revision pins, left disabled:
    # revision='b098244',
    # revision='a8079d5',
)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [16]:
# Report the quantized model's memory footprint, converted from bytes to MiB.
memory_footprint_bytes = model.get_memory_footprint()
memory_footprint_mib = memory_footprint_bytes / (1024 * 1024)
print(f'{memory_footprint_mib:.2f}MiB')

3739.69MiB


In [17]:
from peft import TaskType, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Run PEFT's preparation step for training on top of a quantized model.
# NOTE(review): this may modify `model` in place rather than return a copy --
# treat `model` and `kbit_model` as the same object unless confirmed otherwise.
kbit_model = prepare_model_for_kbit_training(model)

In [18]:
from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING

# Use PEFT's built-in default LoRA target modules for the 'chatglm'
# architecture; the bare expression displays them.
target_modules = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING['chatglm']
target_modules

['query_key_value']

In [19]:
# LoRA adapter configuration built from the values in the config cell.
lora_config = LoraConfig(
    target_modules=target_modules,
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    inference_mode=False,  # the adapter is being created for training
    task_type=TaskType.CAUSAL_LM,
)

In [20]:
# Attach the LoRA adapters to the prepared quantized model and report how many
# parameters are trainable.
qlora_model = get_peft_model(kbit_model, lora_config)
qlora_model.print_trainable_parameters()

trainable params: 974,848 || all params: 6,244,558,848 || trainable%: 0.01561115883009451


In [21]:
from transformers import TrainingArguments, Trainer

# Training hyperparameters. With 12000 examples, batch size 6 and no gradient
# accumulation, one epoch is 2000 optimizer steps (matches global_step in the
# training output).
# NOTE(review): fp16=True enables fp16 mixed precision in the Trainer while the
# quantization config computes in bf16 -- confirm this combination is intended.
training_args = TrainingArguments(
    output_dir=f'models/{model_name}',
    per_device_train_batch_size=6,
    gradient_accumulation_steps=1,
    # per_device_eval_batch_size=8,
    learning_rate=1e-3,
    num_train_epochs=1,
    lr_scheduler_type='linear',
    warmup_ratio=0.1,
    logging_steps=100,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=2,  # keep only the two most recent checkpoints
    # Evaluation is disabled; no eval dataset is passed to the Trainer.
    # evaluation_strategy='steps',
    # eval_steps=500,
    optim='adamw_torch',
    fp16=True,
)

In [22]:
# Wire the LoRA-wrapped model, training arguments, sub-sampled dataset and the
# custom padding collator into a Trainer (train-only: no eval dataset).
trainer = Trainer(
    model=qlora_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [23]:
# Run fine-tuning; the loss is logged every 100 steps (logging_steps).
trainer.train()

Step,Training Loss
100,4.0727
200,3.5641
300,3.5221
400,3.5058
500,3.4779
600,3.4169
700,3.4165
800,3.3848
900,3.3357
1000,3.3791


TrainOutput(global_step=2000, training_loss=3.374401428222656, metrics={'train_runtime': 703.8014, 'train_samples_per_second': 17.05, 'train_steps_per_second': 2.842, 'total_flos': 6.569251339862016e+16, 'train_loss': 3.374401428222656, 'epoch': 1.0})

In [24]:
# Save the trained model via save_pretrained into the same directory used as
# the Trainer's output_dir; the inference section reloads it from there.
trainer.model.save_pretrained(f'models/{model_name}')

#### 推理和对比

In [3]:
import torch
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# Inference setup. Names are re-declared here, so this section does not depend
# on variables from the training cells.
model_name = 'THUDM/chatglm3-6b'
# Directory the adapter was saved to by the training section.
peft_model_path = f'models/{model_name}'
# The adapter config records which base model it was trained on.
peft_config = PeftConfig.from_pretrained(peft_model_path)
# NOTE(review): the compute dtype here is float32 whereas training used bf16 --
# confirm the mismatch is intentional.
q_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float32
)

# Load the quantized base model from the location recorded in the adapter config.
base_model = AutoModel.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=q_config,
    trust_remote_code=True,
    device_map='auto'
)

# Inference only: freeze all parameters and switch to eval mode.
base_model.requires_grad_(False)
base_model.eval()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear4bit(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear4bit(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear4bit(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear4bit(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_la

In [4]:
# A keyword-style product description in the same format as the dataset's
# 'content' column, used as the prompt for the before/after comparison.
input_text = '类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领'
print(f'输入：\n{input_text}')
# Tokenizer for the base model recorded in the adapter config.
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, trust_remote_code=True)

Setting eos_token is not supported, use the default one.
Setting pad_token is not supported, use the default one.
Setting unk_token is not supported, use the default one.


输入：
类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领


In [5]:
# Baseline: generate with the base model before any adapter is attached.
response, history = base_model.chat(tokenizer, input_text)
print(f'ChatGLM3-6B 微调前：\n{response}')



ChatGLM3-6B 微调前：
这款连衣裙以其文艺风格和简约设计吸引了我的注意。它采用了印花图案和撞色设计，让整个裙子看起来更有活力。压褶的裙下摆和圆领设计增添了时尚感和优雅感。同时，裙子的长度适中，既能展现身材，又不会显得过于暴露。总的来说，这是一款既时尚又大气的连衣裙。


In [6]:
# Attach the saved LoRA adapter and generate again with the same prompt.
# NOTE(review): PeftModel.from_pretrained may modify base_model in place, so
# re-running the baseline cell after this one might no longer reflect the
# un-tuned model -- confirm, and run the cells in order on a fresh kernel.
peft_model = PeftModel.from_pretrained(base_model, peft_model_path)
response, history = peft_model.chat(tokenizer, input_text)
print(f'ChatGLM3-6B 微调后：\n{response}')

ChatGLM3-6B 微调后：
连衣裙采用简约大方的圆领设计，修饰脖颈，修饰脸型，打造精致脸型。撞色印花点缀，丰富视觉效果，增添文艺气息。裙摆采用压褶设计，丰富层次，打造优雅气质。
