[NeuralChat] support evaluation perplexity during training. (#1319)
lkk12014402 committed Feb 28, 2024
1 parent 98bfcf8 commit 2858ed1
Showing 3 changed files with 90 additions and 1 deletion.
72 changes: 71 additions & 1 deletion intel_extension_for_transformers/llm/finetuning/eval_utils.py
@@ -19,7 +19,12 @@
import nltk
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader, Dataset
from typing import Dict, List, Optional, Tuple, Union
import time
from transformers.trainer_utils import speed_metrics
from transformers.debug_utils import DebugOption
import math

@torch.no_grad()
def compute_rouge_metric(model, tokenizer, eval_dataset, training_args, gen_kwargs):
@@ -92,3 +97,68 @@ def postprocess_text(preds, labels):
    result = metric.compute(use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    return result

def evaluate_plus_ppl(
    self,
    eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
    ignore_keys: Optional[List[str]] = None,
    metric_key_prefix: str = "eval",
) -> Dict[str, float]:
    """
    Copied from Trainer.evaluate:
    https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/trainer.py#L3029
    The only difference is:
    - add a new metric, eval_ppl
    """
    # handle multiple eval datasets
    eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
    if isinstance(eval_dataset, dict):
        metrics = {}
        for eval_dataset_name, _eval_dataset in eval_dataset.items():
            dataset_metrics = self.evaluate(
                eval_dataset=_eval_dataset,
                ignore_keys=ignore_keys,
                metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}",
            )
            metrics.update(dataset_metrics)
        return metrics

    # memory metrics - must set up as early as possible
    self._memory_tracker.start()

    eval_dataloader = self.get_eval_dataloader(eval_dataset)

    start_time = time.time()

    eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
    output = eval_loop(
        eval_dataloader,
        description="Evaluation",
        # No point gathering the predictions if there are no metrics, otherwise we defer to
        # self.args.prediction_loss_only
        prediction_loss_only=True if self.compute_metrics is None else None,
        ignore_keys=ignore_keys,
        metric_key_prefix=metric_key_prefix,
    )

    total_batch_size = self.args.eval_batch_size * self.args.world_size
    if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
        start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
    output.metrics.update(
        speed_metrics(
            metric_key_prefix,
            start_time,
            num_samples=output.num_samples,
            num_steps=math.ceil(output.num_samples / total_batch_size),
        )
    )

    # New metric added by this commit: perplexity is exp(mean eval cross-entropy loss).
    output.metrics[f"{metric_key_prefix}_ppl"] = math.exp(output.metrics[f"{metric_key_prefix}_loss"])

    self.log(output.metrics)

    self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)

    self._memory_tracker.stop_and_update_metrics(output.metrics)

    return output.metrics
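
The new eval_ppl metric is simply the exponential of the averaged evaluation cross-entropy loss that the Trainer already computes. A hypothetical standalone check of that relationship (not part of this commit; the toy tensors are made up), assuming only torch is installed:

import math
import torch
import torch.nn.functional as F

# Toy next-token prediction problem: 8 positions over a 32k-token vocabulary.
vocab_size, seq_len = 32000, 8
logits = torch.randn(seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (seq_len,))

# Mean cross-entropy in nats, i.e. the same quantity reported as "eval_loss".
loss = F.cross_entropy(logits, labels)
# Same formula as the eval_ppl line above: perplexity = exp(mean loss).
ppl = math.exp(loss.item())
print(f"eval_loss={loss.item():.4f}  eval_ppl={ppl:.4f}")
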
15 changes: 15 additions & 0 deletions intel_extension_for_transformers/llm/finetuning/finetuning.py
@@ -292,6 +292,10 @@ def finetune(self):
f"full finetune only support 16 and 32 bits."
)

if finetune_args.eval_ppl:
from .eval_utils import evaluate_plus_ppl
Trainer.evaluate = evaluate_plus_ppl

config = self.load_model_config(self.model_args)
if config.architectures[0].endswith("ForCausalLM") \
or config.architectures[0].endswith("QWenLMHeadModel"):
@@ -563,6 +567,17 @@ def concatenate_data(dataset, max_seq_length):

        trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
        trainer.save_model()

        # Evaluation
        if training_args.do_eval:
            self.logger.info("*** Evaluate After Training ***")
            metrics = trainer.evaluate()
            max_eval_samples = data_args.max_eval_samples \
                if data_args.max_eval_samples is not None else len(eval_dataset)
            metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
            trainer.log_metrics("eval", metrics)
            trainer.save_metrics("eval", metrics)

        if finetune_args.do_lm_eval and finetune_args.task == "code-generation":
            tokenizer.padding_side = "right"  # padding on the right is needed to cut off padding in `complete_code`
            tokenizer.truncation_side = "left"
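
Because the earlier hunk patches Trainer.evaluate at the class level, the trainer.evaluate() call above, as well as any periodic evaluation that trainer.train() triggers when an evaluation strategy is configured, reports eval_ppl next to eval_loss. A minimal sketch of that mechanism, assuming the package layout shown in this commit maps directly to the import path and that a trainer instance (my_trainer, a placeholder name) has already been built elsewhere:

from transformers import Trainer
from intel_extension_for_transformers.llm.finetuning.eval_utils import evaluate_plus_ppl

Trainer.evaluate = evaluate_plus_ppl   # class-level patch, affects every Trainer instance
# metrics = my_trainer.evaluate()      # metrics would now include "eval_ppl" and "eval_loss"
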
4 changes: 4 additions & 0 deletions intel_extension_for_transformers/neural_chat/config.py
@@ -327,6 +327,10 @@ class FinetuningArguments:
"choices": ["completion", "chat", "summarization", "code-generation"]
},
)
eval_ppl: bool = field(
default=True,
metadata={"help": "whether to compute evaluation perplexity during training."},
)
do_lm_eval: bool = field(
default=False,
metadata={"help": "whether to run the LM evaluation with EleutherAI/lm-evaluation-harness"},
