## 第七章作业

#### 1. 在“LoRA 低秩适配 OpenAI Whisper-Large-V2 语音识别任务”中，为中文语料的训练过程增加过程评估，观察 Train Loss 和 Validation Loss 变化。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_lora_whisper-large-v2.ipynb ）

In [1]:
# Base checkpoint to fine-tune, and the directory used both as the trainer's
# output_dir and as the location the trained adapter is saved to / reloaded from.
model_name = "openai/whisper-large-v2"
model_dir = "models/whisper-large-v2-asr-int8"

# Language / task settings handed to the tokenizer and processor, plus the
# dataset identifier and its language configuration.
language = "Chinese (China)"
language_abbr = "zh-CN"
task = "transcribe"
dataset_name = "mozilla-foundation/common_voice_11_0"

# Shared by per-device training, per-device evaluation and the test DataLoader.
batch_size = 64

In [None]:
from datasets import load_dataset, DatasetDict, Audio

# Only the train and validation splits are loaded here; the test split is
# loaded in a separate cell for the final evaluation.
common_voice = DatasetDict()
common_voice['train'] = load_dataset(dataset_name, language_abbr, split='train', trust_remote_code=True)
common_voice['validation'] = load_dataset(dataset_name, language_abbr, split='validation', trust_remote_code=True)

In [None]:
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor

# Feature extractor, tokenizer and combined processor are all created from the
# same base checkpoint, with the configured language and task.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
# NOTE: `tokenzier` is a misspelling of "tokenizer", but prepare_dataset reads
# this exact global name, so any rename has to be applied in both places.
tokenzier = AutoTokenizer.from_pretrained(model_name, language=language, task=task)
processor = AutoProcessor.from_pretrained(model_name, language=language, task=task)

In [None]:
# Drop the metadata columns listed below; the `audio` and `sentence` columns
# read by prepare_dataset are not in the list and therefore stay.
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
# Re-declare the audio column with a 16 kHz sampling rate.
common_voice = common_voice.cast_column('audio', Audio(sampling_rate=16000))

In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model-ready inputs.

    Reads ``batch['audio']`` (its ``array`` and ``sampling_rate`` entries) and
    ``batch['sentence']``, then stores the extracted features under
    ``input_features`` and the token ids under ``labels``.

    Relies on the module-level ``feature_extractor`` and ``tokenzier`` objects.

    Returns:
        The same ``batch`` mapping, updated in place.
    """
    clip = batch['audio']
    extracted = feature_extractor(clip['array'], sampling_rate=clip['sampling_rate'])
    # The extractor returns a batch of features; keep the single entry.
    batch['input_features'] = extracted.input_features[0]

    encoded = tokenzier(batch['sentence'])
    batch['labels'] = encoded.input_ids
    return batch

In [None]:
# Take a fixed-seed shuffled sample (640 training / 320 validation examples),
# presumably to keep the experiment short, and preprocess only that sample.
small_common_voice = DatasetDict()
small_common_voice['train'] = common_voice['train'].shuffle(seed=16).select(range(640))
small_common_voice['validation'] = common_voice['validation'].shuffle(seed=16).select(range(320))
# This is the dataset the trainer cell consumes (`tokenized_common_voice`).
tokenized_common_voice = small_common_voice.map(prepare_dataset)

In [None]:
# Preprocess the complete train/validation splits with 8 worker processes.
# NOTE(review): the result is bound to `tokenzied_common_voice` (misspelt), a
# name that no other visible cell reads; the trainer uses
# `tokenized_common_voice`, i.e. the 640/320 sample. Either skip this cell or
# fix the spelling if training on the full dataset is intended — confirm.
tokenzied_common_voice = common_voice.map(prepare_dataset, num_proc=8)
# tokenzied_common_voice = common_voice.map(prepare_dataset)

In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech seq2seq examples into one padded batch.

    Audio features and label ids are padded independently, through the
    processor's feature extractor and tokenizer respectively. Padded label
    positions are replaced by -100.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each offering `pad`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch from a list of examples.

        Args:
            features: Examples carrying ``input_features`` and ``labels``.

        Returns:
            The padded feature batch with an added ``labels`` tensor.
        """
        audio_inputs = []
        label_inputs = []
        for example in features:
            audio_inputs.append({'input_features': example['input_features']})
            label_inputs.append({'input_ids': example['labels']})

        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors='pt')
        padded_labels = self.processor.tokenizer.pad(label_inputs, return_tensors='pt')

        # Positions whose attention mask is not 1 are padding; mark them with -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels['input_ids'].masked_fill(is_padding, -100)

        # When every sequence starts with the BOS token, drop that first column.
        starts_with_bos = labels[:, 0] == self.processor.tokenizer.bos_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch['labels'] = labels
        return batch

In [None]:
# Collator instance shared by the trainer and, later, the test DataLoader.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
from transformers import AutoModelForSpeechSeq2Seq

# Load the base checkpoint with load_in_8bit=True; device_map='auto' leaves
# device placement to the library.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, load_in_8bit=True, device_map='auto')
# Clear the forced decoder ids and the suppressed-token list stored on the config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [None]:
# `prepare_model_for_int8_training` is the legacy name of this helper: PEFT
# turned it into a deprecated alias of `prepare_model_for_kbit_training` and
# newer releases no longer export it. Prefer the current name and fall back to
# the legacy one only on old PEFT versions that lack it.
try:
    from peft import prepare_model_for_kbit_training
except ImportError:
    from peft import prepare_model_for_int8_training as prepare_model_for_kbit_training

# Prepare the 8-bit quantized model for adapter training.
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings: rank-4 adapters with lora_alpha=64 and 5% dropout, attached to
# the modules named q_proj and v_proj; bias='none'.
config = LoraConfig(
    r=4,
    lora_alpha=64,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.05,
    bias='none'
)

In [None]:
# Wrap the prepared base model with the LoRA configuration and report how many
# parameters are trainable.
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the trainer built in the next cell.
# NOTE(review): with the 640-example training sample, a batch size of 64 and a
# single epoch, this presumably amounts to about 10 optimizer steps on one
# device, so logging_steps=10 and evaluation_strategy='epoch' would each yield
# only one loss record — confirm, and consider step-based evaluation with a
# smaller interval if a loss curve is wanted.
training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    per_device_train_batch_size=batch_size,
    learning_rate=1e-3,
    num_train_epochs=1,
    evaluation_strategy='epoch',  # evaluate on the validation set once per epoch
    warmup_steps=50,
    fp16=True,
    per_device_eval_batch_size=batch_size,
    generation_max_length=128,
    logging_steps=10,  # training loss is logged every 10 steps
    remove_unused_columns=False,
    label_names=['labels']
)

In [None]:
from transformers import Seq2SeqTrainer

# Trainer wired to the LoRA-wrapped model, the sampled train/validation sets
# and the padding collator.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_common_voice['train'],
    eval_dataset=tokenized_common_voice['validation'],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor  # the feature extractor, not the text tokenizer, is passed here
)
# Turn off the cache flag on the model config before training.
peft_model.config.use_cache = False

In [None]:
from transformers import TrainerCallback

class LossCallback(TrainerCallback):
    """Trainer callback that records training and evaluation losses.

    Each record is a ``(global_step, loss)`` tuple, appended to
    ``train_losses`` or ``eval_losses`` so the curves can be plotted after
    training.
    """

    def __init__(self) -> None:
        super().__init__()
        self.train_losses = []
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store the losses found in one log event.

        Args:
            args: Training arguments (unused).
            state: Trainer state; ``state.global_step`` tags each record.
            control: Trainer control object (unused).
            logs: Mapping of logged values; may be ``None`` or empty.
            **kwargs: Extra keyword arguments supplied by the trainer (unused).
        """
        # `logs` defaults to None, so guard before the membership tests.
        if not logs:
            return
        if 'loss' in logs:
            self.train_losses.append((state.global_step, logs['loss']))
        if 'eval_loss' in logs:
            self.eval_losses.append((state.global_step, logs['eval_loss']))


loss_callback = LossCallback()

In [None]:
# Register the loss recorder before starting, so every log event of this run
# reaches it.
trainer.add_callback(loss_callback)
trainer.train()

In [None]:
# Save the trained model to model_dir; the evaluation cells reload it from there.
trainer.save_model(model_dir)

#### 绘制损失值变化图

In [None]:
import matplotlib.pyplot as plt

# Each recorded entry is a (global_step, loss) pair; split the pairs into
# parallel sequences. Fall back to empty sequences when nothing was recorded,
# because zip(*[]) yields nothing and the two-name unpacking would raise
# ValueError.
train_steps, train_losses = zip(*loss_callback.train_losses) if loss_callback.train_losses else ((), ())
eval_steps, eval_losses = zip(*loss_callback.eval_losses) if loss_callback.eval_losses else ((), ())

plt.figure(figsize=(10, 5))
# Markers keep a series visible even when it holds a single point (a bare line
# needs at least two points before anything is drawn).
plt.plot(train_steps, train_losses, marker='o', label='Train Loss')
plt.plot(eval_steps, eval_losses, marker='o', label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Train and Validation Loss')
plt.legend()
plt.show()


#### 2. 在“LoRA 低秩适配 OpenAI Whisper-Large-V2 语音识别任务”中，当 LoRA 模型训练完成后，使用测试集进行完整的模型评估。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_lora_whisper-large-v2.ipynb ）

In [None]:
from peft import PeftConfig, PeftModel

# Read the adapter configuration saved under model_dir; its
# base_model_name_or_path field names the base checkpoint to load.
peft_config = PeftConfig.from_pretrained(model_dir)

# Reload the base model with the same 8-bit / device_map settings used for training.
base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    peft_config.base_model_name_or_path,
    load_in_8bit=True,
    device_map='auto'
)
# Disable gradients on the base model; it is only used for inference from here on.
base_model.requires_grad_(False)

In [None]:
# Attach the saved adapter from model_dir to the base model (this rebinds the
# `peft_model` name used during training) and switch to evaluation mode.
peft_model = PeftModel.from_pretrained(base_model, model_dir)
peft_model.eval()

In [None]:
# Recreate the tokenizer and processor from the adapter's base checkpoint.
# Note that this rebinds the module-level `processor` and `feature_extractor`
# names created earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor

In [None]:
# Load the test split and apply the same column cleanup and 16 kHz audio cast
# that were applied to the train/validation splits.
common_voice2 = DatasetDict()
common_voice2['test'] = load_dataset(dataset_name, language_abbr, split='test', trust_remote_code=True)
common_voice2 = common_voice2.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
common_voice2 = common_voice2.cast_column('audio', Audio(sampling_rate=16000))

In [None]:
# Preprocess the whole test split. prepare_dataset reads the module-level
# `feature_extractor` (rebound in the previous cell) and `tokenzier` (still the
# object created in the training section).
tokenized_common_voice2 = common_voice2.map(prepare_dataset)

In [None]:
import evaluate

# Word error rate metric, accumulated batch by batch in the evaluation loop.
# NOTE(review): WER works on whitespace-separated words; for Chinese text
# written without spaces a character-level metric (CER) may be more
# informative — confirm which one is wanted.
metric = evaluate.load('wer')

In [None]:
from torch.utils.data import DataLoader

# Batch the preprocessed test split with the collator instance created in the
# training section (it holds the processor that existed at that time).
eval_dataloader = DataLoader(tokenized_common_voice2['test'], batch_size=batch_size, collate_fn=data_collator)

In [None]:
from tqdm import tqdm
import gc
import numpy as np

# Generate a transcription for every test batch and feed the decoded
# prediction / reference pairs to the metric.
for step, batch in enumerate(tqdm(eval_dataloader)):
    # Mixed-precision autocast, with gradient tracking disabled for inference.
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            generated_tokens = (
                peft_model.generate(
                    input_features=batch['input_features'].to('cuda'),
                    # The first four label tokens seed the decoder — presumably
                    # the special language/task prefix tokens; verify.
                    decoder_input_ids=batch['labels'][:, :4].to('cuda'),
                    max_new_tokens=255,
                ).cpu().numpy()
            )
            labels = batch['labels'].cpu().numpy()
            # Swap the -100 padding markers for the pad token id so the labels
            # can be decoded back to text.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            metric.add_batch(predictions=decoded_preds, references=decoded_labels)
    
    # Drop the per-batch arrays and run the garbage collector before the next batch.
    del generated_tokens, labels, batch
    gc.collect()


In [None]:
# Scale the computed metric by 100 so it reads as a percentage.
wer = 100 * metric.compute()
# The `=` specifier prints the expression text together with its value,
# e.g. `wer=12.3%`.
print(f'{wer=}%')