## Logging

### Simple Recipe for Logging with Tensorboard in PyTorch

In [1]:
import os
from datetime import datetime

import torch
from torch.utils.tensorboard import SummaryWriter

# Before training
# The job name embeds a second-resolution timestamp, so runs started at
# different times write to different sub-directories of
# /tmp/data/output/tensorboard/.
job_name = f"gpt2-training-124M-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
data_dir = "/tmp/data/"
log_dir = os.path.join(data_dir, "output/tensorboard", job_name)
writer = SummaryWriter(log_dir=log_dir)

# Within training loop
# `losses`, `lr` and `step` are not defined in this snippet; they are expected
# to come from the surrounding training loop. `losses` is indexed with the
# keys "train" and "eval".
writer.add_scalar("Loss/train", losses["train"], step)
writer.add_scalar("Loss/eval", losses["eval"], step)
writer.add_scalar("learning_rate", lr, step)

# After training
# Hyperparameters recorded alongside the final metric. Every value on the
# right-hand side is a variable defined elsewhere in the training script.
hparam_dict = {
    "max_learning_rate": max_learning_rate,
    "max_steps": max_steps,
    "warmup_steps": warmup_steps,
    "min_learning_rate": min_learning_rate,
    "lr_decay_steps": lr_decay_steps,
    "batch_size": batch_size,
    "weight_decay": weight_decay,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "grad_clip": grad_clip,
    "beta1": beta1,
    "beta2": beta2,
    "n_layer": n_layer,
    "n_embd": n_embd,
    "n_head": n_head,
    "context_size": context_size,
    "vocab_size": vocab_size,
    "total_training_tokens": total_training_tokens,
    # Ratio of total training tokens to unique training tokens.
    "n_times_through_data": total_training_tokens / total_training_tokens_unique,
    "n_params": n_params
}
# `.item()` is called on best_eval_loss, so it is presumably a 0-dim tensor
# (TODO confirm against the training loop).
metric_dict = {"hparam/loss": best_eval_loss.item()}
writer.add_hparams(hparam_dict=hparam_dict, metric_dict=metric_dict)
# Push any buffered events to the event file on disk.
writer.flush()

### Additional Items to Log

In [2]:
def log_gradients(model, writer, step, prefix="grad"):
    """Write one gradient histogram per parameter with at least two dimensions.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
        writer: Object exposing ``add_histogram(tag, values, step)``.
        step: Global step value attached to every histogram.
        prefix: Suffix of the tag; each tag is ``"<param name>/<prefix>"``.

    Parameters whose ``.grad`` is ``None`` and parameters with fewer than two
    dimensions are skipped.
    """
    for name, param in model.named_parameters():
        gradient = param.grad
        if gradient is None:
            continue
        if param.dim() < 2:
            continue
        tag = f"{name}/{prefix}"
        # Cast to float32 and move to the CPU before handing over to the writer.
        writer.add_histogram(tag, gradient.float().cpu(), step)

def log_weights(model, writer, step, prefix="parameter"):
    """Write a histogram of the values of every parameter that has a gradient.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
        writer: Object exposing ``add_histogram(tag, values, step)``.
        step: Global step value attached to every histogram.
        prefix: Suffix of the tag; each tag is ``"<param name>/<prefix>"``.

    Parameters whose ``.grad`` is ``None`` are skipped, so nothing is logged
    for parameters that have not received a gradient.
    """
    for name, weight in model.named_parameters():
        if weight.grad is None:
            continue
        values = weight.float().cpu()
        writer.add_histogram(f"{name}/{prefix}", values, step)


def log_norms(model, writer, step, prefix="norm"):
    """Log the norm of each parameter and of its gradient as scalars.

    For every parameter that has a gradient, two scalars are written:
    ``"<param name>/<prefix>_wt"`` (norm of the parameter values) and
    ``"<param name>/<prefix>_grad"`` (norm of the gradient).

    A norm is a single number per step, so it is written with ``add_scalar``
    and shows up as a curve over training steps. Logging it through
    ``add_histogram`` would only produce a one-sample histogram.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
        writer: Object exposing ``add_scalar(tag, scalar_value, step)``.
        step: Global step value attached to every scalar.
        prefix: Middle part of the tag suffix (``"<prefix>_wt"`` and
            ``"<prefix>_grad"``).
    """
    for pn, p in model.named_parameters():
        if p.grad is None:
            continue
        # Detach so that computing the norm does not build an autograd graph.
        weight_norm = torch.norm(p.detach()).float().item()
        grad_norm = torch.norm(p.grad).float().item()
        writer.add_scalar(f"{pn}/{prefix}_wt", weight_norm, step)
        writer.add_scalar(f"{pn}/{prefix}_grad", grad_norm, step)

def log_minmax(model, writer, step):
    """Log the smallest and largest value of each weight matrix as scalars.

    For every parameter that has a gradient and at least two dimensions, two
    scalars are written: ``"<param name>/min"`` and ``"<param name>/max"``.

    The minimum and maximum are single numbers per step, so they are written
    with ``add_scalar`` and appear as curves over training steps rather than
    as one-sample histograms.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
        writer: Object exposing ``add_scalar(tag, scalar_value, step)``.
        step: Global step value attached to every scalar.
    """
    for pn, p in model.named_parameters():
        if p.grad is None or p.dim() < 2:
            continue
        # torch.min / torch.max raise on a tensor with no elements.
        if p.numel() == 0:
            continue
        values = p.detach()
        writer.add_scalar(f"{pn}/min", torch.min(values).float().item(), step)
        writer.add_scalar(f"{pn}/max", torch.max(values).float().item(), step)