From 24591a7160cd3af83a11ca2897dd1d7328bb0e0b Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Fri, 8 Nov 2024 17:07:47 +0000
Subject: [PATCH] feat: add total_samples as a field to logs being emitted

Today we emit a number of important values in the logs, but we do not explicitly report how many samples
we plan to iterate through. This commit emits total_samples as a field in both the initial log
and the subsequent logs emitted during training. This way, any program that wants to display
training progress can read this value consistently.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 src/instructlab/training/main_ds.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 7910c341..5eabb384 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -18,6 +18,7 @@
 
 # pylint: disable=no-name-in-module
 from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM
+from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, get_scheduler
 import torch
@@ -325,7 +326,7 @@ def train(
     lr_scheduler,
     accelerator: Accelerator,
     tokenizer,
-    train_loader,
+    train_loader: DataLoader,
     grad_accum,
     metric_logger,
 ):
@@ -457,6 +458,7 @@ def train(
                         "total_loss": float(log_loss / num_loss_counted_tokens),
                         "samples_seen": samples_seen,
                         "gradnorm": global_grad_norm,
+                        "total_samples": len(train_loader.dataset),
                         # "weight_norm": weight_norm,
                     }
                 )
@@ -620,6 +622,7 @@ def main(args):
                 "num_batches": len(train_loader),
                 "avg_samples_per_batch": len(dataset) / len(train_loader),
                 "samples_per_gpu": args.samples_per_gpu,
+                "total_samples": len(dataset),  # emit the total number of samples
             }
         )