From 24591a7160cd3af83a11ca2897dd1d7328bb0e0b Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Fri, 8 Nov 2024 17:07:47 +0000 Subject: [PATCH] feat: add total_samples as a field to logs being emitted Today we emit a number of important values in the logs, but we do not explicitly announce how many samples we are planning on iterating through. This commit emits total_samples as a field in both the initial log and the subsequent logs during training. This way, the value can be accessed consistently and conveniently by any program interested in displaying information about training progress. Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/training/main_ds.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 7910c341..5eabb384 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -18,6 +18,7 @@ # pylint: disable=no-name-in-module from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM +from torch.utils.data import DataLoader from tqdm import tqdm from transformers import AutoModelForCausalLM, get_scheduler import torch @@ -325,7 +326,7 @@ def train( lr_scheduler, accelerator: Accelerator, tokenizer, - train_loader, + train_loader: DataLoader, grad_accum, metric_logger, ): @@ -457,6 +458,7 @@ def train( "total_loss": float(log_loss / num_loss_counted_tokens), "samples_seen": samples_seen, "gradnorm": global_grad_norm, + "total_samples": len(train_loader.dataset), # "weight_norm": weight_norm, } ) @@ -620,6 +622,7 @@ def main(args): "num_batches": len(train_loader), "avg_samples_per_batch": len(dataset) / len(train_loader), "samples_per_gpu": args.samples_per_gpu, + "total_samples": len(dataset), # emit the total number of samples } )