## 第四章作业

In [2]:
import os

os.environ['TRANSFORMERS_CACHE'] = '/mnt/sda/huggingface/hub/'
os.environ['HF_HOME'] = '/mnt/sda/huggingface/'
# os.environ['http_proxy'] = 'http://127.0.0.1:7890'
# os.environ['https_proxy'] = 'http://127.0.0.1:7890'

In [3]:
import collections
import random
import pandas as pd
import numpy as np
import evaluate
import datasets
from IPython.display import display, HTML
from tqdm.auto import tqdm



In [4]:
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizerFast,
    default_data_collator
)

In [5]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = random.sample(range(len(dataset)), k=num_examples)

    df = pd.DataFrame(dataset[picks])
    for col, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[col] = df[col].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

#### 1. 使用完整的 YelpReviewFull 数据集训练，对比看 Acc 最高能到多少。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/transformers/fine-tune-quickstart.ipynb ）

In [5]:
dataset = load_dataset('yelp_review_full')
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [7]:
train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(100000))
eval_dataset = tokenized_datasets['test'].shuffle(seed=42).select(range(1000))
test_dataset = tokenized_datasets['test'].shuffle(seed=64).select(range(100))

In [8]:
model_dir = 'models/bert-base-cased-finetune-yelp'
training_args = TrainingArguments(
    output_dir=model_dir,
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=20,
    num_train_epochs=3,
    logging_steps=30,
    save_steps=500,
    save_total_limit=5
)

In [9]:
metric = evaluate.load('accuracy')

def compute_metric(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metric,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
trainer.train(resume_from_checkpoint=True)

Epoch,Training Loss,Validation Loss,Accuracy
2,0.5112,0.790075,0.689
3,0.4702,0.812829,0.693


TrainOutput(global_step=15000, training_loss=0.22096032746632893, metrics={'train_runtime': 11092.8314, 'train_samples_per_second': 27.044, 'train_steps_per_second': 1.352, 'total_flos': 7.89354427392e+16, 'train_loss': 0.22096032746632893, 'epoch': 3.0})

In [12]:
trainer.evaluate(test_dataset)

{'eval_loss': 1.0953632593154907,
 'eval_accuracy': 0.63,
 'eval_runtime': 2.9788,
 'eval_samples_per_second': 33.571,
 'eval_steps_per_second': 4.364,
 'epoch': 3.0}

In [13]:
trainer.save_model(model_dir)

In [14]:
trainer.save_state()

#### 2. 加载本地保存的模型，进行评估和再训练更高的 F1 Score。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/transformers/fine-tune-QA.ipynb ）

In [6]:
squad_v2 = False
model_checkpoint = 'distilbert-base-uncased'
model_dir2 = f"models/{model_checkpoint}-finetuned-squad"
batch_size2 = 64
# The maximum length of a feature (question and context)
max_length = 384
# The authorized overlap between two part of the context when splitting it is needed.
doc_stride = 128

In [7]:
dataset2 = load_dataset('squad_v2' if squad_v2 else 'squad')
tokenizer2 = AutoTokenizer.from_pretrained(model_checkpoint)
pad_on_right = tokenizer2.padding_side == 'right'



In [8]:
assert isinstance(tokenizer2, PreTrainedTokenizerFast)

##### 2-1 模型训练

In [9]:
def prepare_train_features(examples):
    examples['question'] = [q.lstrip() for q in examples['question']]
    tokenized_examples = tokenizer2(
        examples['question' if pad_on_right else 'context'],
        examples['context' if pad_on_right else 'question'],
        truncation='only_second' if pad_on_right else 'only_first',
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )
    sample_mapping = tokenized_examples.pop('overflow_to_sample_mapping')
    offset_mapping = tokenized_examples.pop('offset_mapping')
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples['input_ids'][i]
        seq_ids = tokenized_examples.sequence_ids(i)
        cls_index = input_ids.index(tokenizer2.cls_token_id)
        sample_index = sample_mapping[i]
        answers = examples['answers'][sample_index]
        if len(answers['answer_start']) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers['answer_start'][0]
            end_char = start_char + len(answers['text'][0])
            token_start_index = 0
            while seq_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while seq_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1
            
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                while token_end_index < len(offsets) and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)
    tokenized_examples['start_positions'] = start_positions
    tokenized_examples['end_positions'] = end_positions
    return tokenized_examples

In [10]:
tokenized_datasets2 = dataset2.map(
    prepare_train_features, 
    batched=True, 
    remove_columns=dataset2['train'].column_names
)

In [15]:
model2 = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
args2 = TrainingArguments(
    output_dir=model_dir2,
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size2,
    per_device_eval_batch_size=batch_size2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=5
)

In [17]:
trainer2 = Trainer(
    model2,
    args2,
    train_dataset=tokenized_datasets2['train'],
    eval_dataset=tokenized_datasets2['validation'],
    data_collator=default_data_collator,
    tokenizer=tokenizer2
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer2.train(resume_from_checkpoint=True)

Epoch,Training Loss,Validation Loss
1,1.4728,1.259785
2,1.1072,1.175819
3,0.9709,1.168556


TrainOutput(global_step=4152, training_loss=1.2928013553509135, metrics={'train_runtime': 8652.8959, 'train_samples_per_second': 30.692, 'train_steps_per_second': 0.48, 'total_flos': 2.602335381127373e+16, 'train_loss': 1.2928013553509135, 'epoch': 3.0})

In [None]:
model2_to_save = trainer.save_model(model_dir2)

##### 2-2 模型评估

In [14]:
def prepare_validation_features(examples):
    examples['question'] = [q.lstrip() for q in examples['question']]
    tokenized_examples = tokenizer2(
        examples['question' if pad_on_right else 'context'],
        examples['context' if pad_on_right else 'question'],
        truncation='only_second' if pad_on_right else 'only_first',
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )
    sample_mapping = tokenized_examples.pop('overflow_to_sample_mapping')
    tokenized_examples['example_id'] = []
    for i in range(len(tokenized_examples['input_ids'])):
        seq_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0
        sample_index = sample_mapping[i]
        tokenized_examples['example_id'].append(examples['id'][sample_index])
        tokenized_examples['offset_mapping'][i] = [
            (o if seq_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples['offset_mapping'][i])
        ]
    return tokenized_examples

In [15]:
validation_features = dataset2['validation'].map(
    prepare_validation_features,
    batched=True,
    remove_columns=dataset2['validation'].column_names
)

In [16]:
raw_predictions = trainer2.predict(validation_features)

In [17]:
validation_features.set_format(
    type=validation_features.format['type'],
    columns=list(validation_features.features.keys())
)

In [18]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30):
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples['id'])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature['example_id']]].append(i)
    
    predictions = collections.OrderedDict()
    print(f'正在后处理 {len(examples)} 个示例的预测，这些预测分散在 {len(features)} 个特征中。')

    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]
        min_null_score = None  # 仅在 squad_v2 为True时使用
        valid_answers = []
        context = example['context']
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]['offset_mapping']
            
            cls_index = features[feature_index]['input_ids'].index(tokenizer2.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score
            
            start_indexes = np.argsort(start_logits)[-1:-n_best_size-1:-1].tolist()
            end_indexes = np.argsort(end_logits)[-1:-n_best_size-1:-1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping) 
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None 
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append({
                        'text': context[start_char: end_char],
                        'score': start_logits[start_index] + end_logits[end_index]
                    })
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x['score'], reverse=True)[0]
        else:
            best_answer = {'text': '', 'score': 0.0}
        
        if not squad_v2:
            predictions[example['id']] = best_answer['text']
        else:
            answer = best_answer['text'] if best_answer['score'] > min_null_score else ''
            predictions[example['id']] = answer
    return predictions


In [19]:
final_predictions = postprocess_qa_predictions(
    dataset2['validation'], 
    validation_features, 
    raw_predictions.predictions
)

正在后处理 10570 个示例的预测，这些预测分散在 10784 个特征中。


  0%|          | 0/10570 [00:00<?, ?it/s]

In [20]:
metric = load_metric('squad_v2' if squad_v2 else 'squad')

  metric = load_metric('squad_v2' if squad_v2 else 'squad')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

In [21]:
if squad_v2:
    formatted_predictions = [
        {'id': k, 'prediction_text': v, 'no_answer_probability': 0.0} 
        for k, v in final_predictions.items()
    ]
else:
    formatted_predictions = [{'id': k, 'prediction_text': v} for k, v in final_predictions.items()]
references = [{'id': ex['id'], 'answers': ex['answers']} for ex in dataset2['validation']]

metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 74.94796594134343, 'f1': 83.8309626918993}

##### 2-3 加载本地模型，再训练，再评估

In [12]:
trained_model2 = AutoModelForQuestionAnswering.from_pretrained(model_dir2)

In [22]:
trained_trainer2 = Trainer(
    trained_model2,
    args2,
    train_dataset=tokenized_datasets2['train'],
    eval_dataset=tokenized_datasets2['validation'],
    data_collator=default_data_collator,
    tokenizer=tokenizer2
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [23]:
trained_trainer2.train(resume_from_checkpoint=True)

Epoch,Training Loss,Validation Loss
1,0.8316,1.235142
2,0.6997,1.247907
3,0.673,1.256765


TrainOutput(global_step=4152, training_loss=0.7396820652691615, metrics={'train_runtime': 8648.9923, 'train_samples_per_second': 30.706, 'train_steps_per_second': 0.48, 'total_flos': 2.602335381127373e+16, 'train_loss': 0.7396820652691615, 'epoch': 3.0})

In [24]:
trained_trainer2.save_model(model_dir2)

In [25]:
trained_raw_predictions = trained_trainer2.predict(validation_features)

In [26]:
validation_features.set_format(
    type=validation_features.format['type'],
    columns=list(validation_features.features.keys())
)

In [27]:
final_predictions = postprocess_qa_predictions(
    dataset2['validation'], 
    validation_features, 
    trained_raw_predictions.predictions
)

正在后处理 10570 个示例的预测，这些预测分散在 10784 个特征中。


  0%|          | 0/10570 [00:00<?, ?it/s]

In [28]:
if squad_v2:
    formatted_predictions = [
        {'id': k, 'prediction_text': v, 'no_answer_probability': 0.0} 
        for k, v in final_predictions.items()
    ]
else:
    formatted_predictions = [{'id': k, 'prediction_text': v} for k, v in final_predictions.items()]
references = [{'id': ex['id'], 'answers': ex['answers']} for ex in dataset2['validation']]

metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 75.55345316934721, 'f1': 84.2860083518279}