In [1]:
import optuna
import nemo
import nemo.collections.nlp as nemo_nlp
from nemo.utils.exp_manager import exp_manager
import torch.multiprocessing as mp
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder

In [2]:
import os
import tempfile
import sys

In [3]:
from omegaconf.omegaconf import OmegaConf, open_dict
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
from nemo.core.config import hydra_runner

In [4]:
from hydra import initialize, compose

In [5]:
mp.set_start_method("spawn", force=True)

In [6]:
# @hydra_runner(config_path="/global/scratch/users/ksevegnani/nemo_test", config_name="llama2_7b.yaml")
# def get_config(cfg):
#     print(f'\n{OmegaConf.to_yaml(cfg)}')
#     return cfg

In [7]:
def get_config():
    with initialize(version_base=None, config_path="nemo_test"):
        cfg = compose(config_name="llama2_7b_optuna.yaml")
    print(f'\n{OmegaConf.to_yaml(cfg)}')
    return cfg

In [8]:
def initialize_model(cfg, trainer):
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
    
    # Continual training
    if cfg.model.get("restore_from_path") is not None:
        # Option 1: Restore only the model weights from a .nemo file
        logging.info(f"Continual training: loading weights from {cfg.model.restore_from_path}")
        model = MegatronGPTModel.restore_from(
            restore_path=cfg.model.restore_from_path,
            override_config_path=cfg.model,
            trainer=trainer,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    elif cfg.model.get("restore_from_ckpt") is not None:
        # Option 2: Restore both model weights and optimizer states from a PTL checkpoint
        logging.info(f"Continual training: loading weights and optimizer states from {cfg.model.restore_from_ckpt}")
        trainer.ckpt_path = Path(cfg.model.restore_from_ckpt)
        model = MegatronGPTModel(cfg.model, trainer)
    else:
        # Start new pretraining or resume from a checkpoint if it exists
        model = MegatronGPTModel(cfg.model, trainer)
    
    return model

In [9]:
# Define the objective function for Optuna
def objective(trial):
    cfg = get_config()
    
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-2, log=True)
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.2)

    

    trainer = MegatronTrainerBuilder(cfg).create_trainer()
    exp_manager(trainer, cfg.exp_manager)

    # Load the pre-trained Llama 2 model
    model = initialize_model(cfg, trainer)

    # Configure the model with suggested hyperparameters
    model.cfg.optim.lr = learning_rate
    model.cfg.optim.weight_decay = weight_decay
    model.cfg.optim.sched.warmup_ratio = warmup_ratio

    # Set up the trainer
    #     trainer = pl.Trainer(
    #         max_epochs=3,
    #         gpus=1,
    #         precision=16,
    #         amp_level='O2',
    #         accelerator="gpu",
    #         strategy="ddp",
    #         log_every_n_steps=10,
    #         val_check_interval=0.5,
    #     )


        # Set up the experiment manager
    #         exp_manager(
    #             trainer,
    #             exp_dir="optuna_experiments",
    #             create_tensorboard_logger=True,
    #             create_wandb_logger=False,
    #         )

    # Fine-tune the model
    trainer.fit(model)

    # Return the validation loss as the objective value
    return trainer.callback_metrics['val_loss'].item()



In [10]:
# Create an Optuna study
study = optuna.create_study(direction="minimize")

[I 2024-10-15 07:05:29,367] A new study created in memory with name: no-name-61cd6ea9-a1f2-4314-bff1-726557949040


In [11]:
# Optimize the hyperparameters
study.optimize(objective, n_trials=3, gc_after_trial=True)


run:
  name: llama2_7b
  results_dir: /global/scratch/users/ksevegnani/nemo_test/out
  time_limit: 0-01:30:00
  dependency: singleton
trainer:
  num_nodes: 1
  devices: 1
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 100
  max_time: 05:23:30:00
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: 32
  limit_test_batches: 50
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: ${run.results_dir}/results
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${run.name}
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: false
    save_nemo_on_train_end: false
    filename: megatron_llama--{val_loss:.2f}-{ste

[NeMo W 2024-10-15 07:05:29 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/_graveyard/precision.py:49: The `MixedPrecisionPlugin` is deprecated. Use `pytorch_lightning.plugins.precision.MixedPrecision` instead.
    


[NeMo I 2024-10-15 07:05:29 dist_ckpt_io:320] Using ('zarr', 1) dist-ckpt save strategy.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[NeMo I 2024-10-15 07:05:29 exp_manager:341] ExpManager schema
[NeMo I 2024-10-15 07:05:29 exp_manager:342] {'explicit_log_dir': None, 'exp_dir': None, 'name': None, 'version': None, 'use_datetime_version': True, 'resume_if_exists': False, 'resume_past_end': False, 'resume_ignore_no_checkpoint': False, 'resume_from_checkpoint': None, 'create_tensorboard_logger': True, 'summary_writer_kwargs': None, 'create_wandb_logger': False, 'wandb_logger_kwargs': None, 'create_mlflow_logger': False, 'mlflow_logger_kwargs': {'experiment_name': None, 'tracking_uri': None, 'tags': None, 'save_dir': './mlruns', 'prefix': '', 'artifact_location': None, 'run_id': None, 'log_model': False}, 'create_dllogger_logger': False, 'dllogger_logger_kwargs': {'verbose': False, 'stdout': False, 'json_file': './dllogger.json'}, 'create_clearml_logger': False, 'clearml_logger_kwargs': {'project': None, 'task': None, 'connect_pytorch': False, 'model_name': None, 'tags': None, 'log_model': False, 'log_cfg': False, 'log_

[NeMo W 2024-10-15 07:05:30 exp_manager:712] Exp_manager is logging to /global/scratch/users/ksevegnani/nemo_test/out/results, but it already exists.


[NeMo I 2024-10-15 07:05:30 exp_manager:400] Experiments will be logged at /global/scratch/users/ksevegnani/nemo_test/out/results
[NeMo I 2024-10-15 07:05:30 exp_manager:860] TensorboardLogger has been set up


[NeMo W 2024-10-15 07:05:30 exp_manager:970] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 100. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


[NeMo I 2024-10-15 07:05:30 3748006741:2] 
    
    ************** Experiment configuration ***********
[NeMo I 2024-10-15 07:05:30 3748006741:3] 
    run:
      name: llama2_7b
      results_dir: /global/scratch/users/ksevegnani/nemo_test/out
      time_limit: 0-01:30:00
      dependency: singleton
    trainer:
      num_nodes: 1
      devices: 1
      accelerator: gpu
      precision: bf16
      logger: false
      enable_checkpointing: false
      use_distributed_sampler: false
      max_epochs: null
      max_steps: 100
      max_time: 05:23:30:00
      log_every_n_steps: 10
      val_check_interval: 100
      limit_val_batches: 32
      limit_test_batches: 50
      accumulate_grad_batches: 1
      gradient_clip_val: 1.0
    exp_manager:
      explicit_log_dir: ${run.results_dir}/results
      exp_dir: null
      name: megatron_llama
      create_wandb_logger: false
      wandb_logger_kwargs:
        project: nemo_llama_pretrain
        name: ${run.name}
      resume_if_exists: fal

[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo I 2024-10-15 07:05:30 megatron_init:263] Rank 0 has data parallel group : [0]
[NeMo I 2024-10-15 07:05:30 megatron_init:269] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2024-10-15 07:05:30 megatron_init:274] All data parallel group ranks with context parallel combined: [[0]]
[NeMo I 2024-10-15 07:05:30 megatron_init:277] Ranks 0 has data parallel rank: 0
[NeMo I 2024-10-15 07:05:30 megatron_init:285] Rank 0 has context parallel group: [0]
[NeMo I 2024-10-15 07:05:30 megatron_init:288] All context parallel group ranks: [[0]]
[NeMo I 2024-10-15 07:05:30 megatron_init:289] Ranks 0 has context parallel rank: 0
[NeMo I 2024-10-15 07:05:30 megatron_init:296] Rank 0 has model parallel group: [0]
[NeMo I 2024-10-15 07:05:30 megatron_init:297] All model parallel group ranks: [[0]]
[NeMo I 2024-10-15 07:05:30 megatron_init:306] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-10-15 07:05:30 megatron_init:310] All tensor model parallel group ranks: 

[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:30 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo I 2024-10-15 07:05:31 tokenizer_utils:188] Getting SentencePiece with model: /global/scratch/users/ksevegnani/nemo_test/llama-tokenizer.model
[NeMo I 2024-10-15 07:05:31 megatron_base_model:584] Padded vocab_size: 32128, original vocab_size: 32003, dummy tokens: 125.


[NeMo W 2024-10-15 07:05:31 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:31 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:31 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:31 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:31 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo W 2024-10-15 07:05:31 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_z_loss_coeff in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:31 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_input_jitter_eps in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:31 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_token_dropping in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:31 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_token_dispatcher_type in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:05:31 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_per_layer_logging in its cfg. Add th

[NeMo I 2024-10-15 07:05:56 megatron_gpt_model:1592] Pipeline model parallel rank: 0, Tensor model parallel rank: 0, Number of model parameters on device: 4.69e+08. Number of precise model parameters on device: 469256192.
[NeMo I 2024-10-15 07:05:56 megatron_gpt_model:1446] Building GPT datasets.
[NeMo I 2024-10-15 07:05:56 utils:47] Let split_matrix = [(0, 0.9801980198019802), (0.9801980198019802, 0.9900990099009901), (0.9900990099009901, 1.0)]
[NeMo I 2024-10-15 07:05:56 utils:47] Building dataset splits with cls=GPTDataset, sizes=[100, 64, 50], and config=GPTDatasetConfig(random_seed=1234, sequence_length=4096, blend=(['/global/scratch/users/ksevegnani/nemo_test/pubmedqa_big_llama_input_document'], [1.0]), blend_per_split=None, split='99,1,1', split_matrix=[(0, 0.9801980198019802), (0.9801980198019802, 0.9900990099009901), (0.9900990099009901, 1.0)], num_dataset_builder_threads=1, path_to_cache=None, mmap_bin_files=True, mock=False, tokenizer=<nemo.collections.common.tokenizers.sent

[NeMo W 2024-10-15 07:05:57 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:05:57 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:05:57 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:05:57 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None
[NeMo W 2024-10-15 07:05:57 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:05:57 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:05:57 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:05:57 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None
[NeMo W 2024-10-15 07:05:57 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:05:57 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:05:57 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:05:57 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None


[NeMo I 2024-10-15 07:05:57 megatron_gpt_model:1530] Length of train dataset: 101
[NeMo I 2024-10-15 07:05:57 megatron_gpt_model:1532] Length of val dataset: 65
[NeMo I 2024-10-15 07:05:57 megatron_gpt_model:1534] Length of test dataset: 51
[NeMo I 2024-10-15 07:05:57 megatron_gpt_model:1535] Finished building GPT datasets.
[NeMo I 2024-10-15 07:05:57 megatron_gpt_model:1636] Setting up train dataloader with len(len(self._train_ds)): 101 and consumed samples: 0
[NeMo I 2024-10-15 07:05:57 megatron_gpt_model:1544] Building dataloader with consumed samples: 0
[NeMo I 2024-10-15 07:05:57 data_samplers:76] Instantiating MegatronPretrainingSampler with total_samples: 101 and consumed_samples: 0
[NeMo I 2024-10-15 07:05:57 megatron_gpt_model:1644] Setting up validation dataloader with len(len(self._validation_ds)): 65 and consumed samples: 0
[NeMo I 2024-10-15 07:05:57 megatron_gpt_model:1544] Building dataloader with consumed samples: 0
[NeMo I 2024-10-15 07:05:57 data_samplers:76] Instanti

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2024-10-15 07:05:57 modelPT:770] Optimizer config = MegatronDistributedFusedAdam (
    Parameter Group 0
        betas: [0.9, 0.95]
        bias_correction: True
        eps: 1e-08
        is_expert: False
        lr: 0.0003654503479685462
        weight_decay: 0.0009979655831599337
    )
[NeMo I 2024-10-15 07:05:57 lr_scheduler:923] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x51869e1ab4f0>" 
    will be used during training (effective maximum steps = 100) - 
    Parameters : 
    (warmup_ratio: 0.1911978919333179
    constant_steps: 0
    min_lr: 1.0e-05
    max_steps: 100
    )



  | Name  | Type          | Params
----------------------------------------
0 | model | Float16Module | 469 M 
----------------------------------------
469 M     Trainable params
0         Non-trainable params
469 M     Total params
1,877.025 Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

[NeMo W 2024-10-15 07:05:58 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `validation_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    
[NeMo W 2024-10-15 07:06:11 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
    
[NeMo W 2024-10-15 07:06:11 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `training_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    


Training: |                                                                                                   …

[NeMo W 2024-10-15 07:06:24 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `validation_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    


Validation: |                                                                                                 …

Epoch 0, global step 100: 'val_loss' reached 5.90896 (best 5.90896), saving model to '/global/scratch/users/ksevegnani/nemo_test/out/results/checkpoints/megatron_llama--val_loss=5.91-step=100-consumed_samples=100.0.ckpt' as top 10
`Trainer.fit` stopped: `max_steps=100` reached.
[I 2024-10-15 07:06:41,517] Trial 0 finished with value: 5.908960819244385 and parameters: {'learning_rate': 0.0003654503479685462, 'weight_decay': 0.0009979655831599337, 'warmup_ratio': 0.1911978919333179}. Best is trial 0 with value: 5.908960819244385.



run:
  name: llama2_7b
  results_dir: /global/scratch/users/ksevegnani/nemo_test/out
  time_limit: 0-01:30:00
  dependency: singleton
trainer:
  num_nodes: 1
  devices: 1
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 100
  max_time: 05:23:30:00
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: 32
  limit_test_batches: 50
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: ${run.results_dir}/results
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${run.name}
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: false
    save_nemo_on_train_end: false
    filename: megatron_llama--{val_loss:.2f}-{ste

[NeMo W 2024-10-15 07:06:41 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/_graveyard/precision.py:49: The `MixedPrecisionPlugin` is deprecated. Use `pytorch_lightning.plugins.precision.MixedPrecision` instead.
    


[NeMo I 2024-10-15 07:06:41 dist_ckpt_io:320] Using ('zarr', 1) dist-ckpt save strategy.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[NeMo I 2024-10-15 07:06:41 exp_manager:341] ExpManager schema
[NeMo I 2024-10-15 07:06:41 exp_manager:342] {'explicit_log_dir': None, 'exp_dir': None, 'name': None, 'version': None, 'use_datetime_version': True, 'resume_if_exists': False, 'resume_past_end': False, 'resume_ignore_no_checkpoint': False, 'resume_from_checkpoint': None, 'create_tensorboard_logger': True, 'summary_writer_kwargs': None, 'create_wandb_logger': False, 'wandb_logger_kwargs': None, 'create_mlflow_logger': False, 'mlflow_logger_kwargs': {'experiment_name': None, 'tracking_uri': None, 'tags': None, 'save_dir': './mlruns', 'prefix': '', 'artifact_location': None, 'run_id': None, 'log_model': False}, 'create_dllogger_logger': False, 'dllogger_logger_kwargs': {'verbose': False, 'stdout': False, 'json_file': './dllogger.json'}, 'create_clearml_logger': False, 'clearml_logger_kwargs': {'project': None, 'task': None, 'connect_pytorch': False, 'model_name': None, 'tags': None, 'log_model': False, 'log_cfg': False, 'log_

[NeMo W 2024-10-15 07:06:41 exp_manager:712] Exp_manager is logging to /global/scratch/users/ksevegnani/nemo_test/out/results, but it already exists.


[NeMo I 2024-10-15 07:06:41 exp_manager:400] Experiments will be logged at /global/scratch/users/ksevegnani/nemo_test/out/results
[NeMo I 2024-10-15 07:06:41 exp_manager:860] TensorboardLogger has been set up


[NeMo W 2024-10-15 07:06:41 exp_manager:970] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 100. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


[NeMo I 2024-10-15 07:06:41 3748006741:2] 
    
    ************** Experiment configuration ***********
[NeMo I 2024-10-15 07:06:41 3748006741:3] 
    run:
      name: llama2_7b
      results_dir: /global/scratch/users/ksevegnani/nemo_test/out
      time_limit: 0-01:30:00
      dependency: singleton
    trainer:
      num_nodes: 1
      devices: 1
      accelerator: gpu
      precision: bf16
      logger: false
      enable_checkpointing: false
      use_distributed_sampler: false
      max_epochs: null
      max_steps: 100
      max_time: 05:23:30:00
      log_every_n_steps: 10
      val_check_interval: 100
      limit_val_batches: 32
      limit_test_batches: 50
      accumulate_grad_batches: 1
      gradient_clip_val: 1.0
    exp_manager:
      explicit_log_dir: ${run.results_dir}/results
      exp_dir: null
      name: megatron_llama
      create_wandb_logger: false
      wandb_logger_kwargs:
        project: nemo_llama_pretrain
        name: ${run.name}
      resume_if_exists: fal

[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo I 2024-10-15 07:06:41 megatron_init:263] Rank 0 has data parallel group : [0]
[NeMo I 2024-10-15 07:06:41 megatron_init:269] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2024-10-15 07:06:41 megatron_init:274] All data parallel group ranks with context parallel combined: [[0]]
[NeMo I 2024-10-15 07:06:41 megatron_init:277] Ranks 0 has data parallel rank: 0
[NeMo I 2024-10-15 07:06:41 megatron_init:285] Rank 0 has context parallel group: [0]
[NeMo I 2024-10-15 07:06:41 megatron_init:288] All context parallel group ranks: [[0]]
[NeMo I 2024-10-15 07:06:41 megatron_init:289] Ranks 0 has context parallel rank: 0
[NeMo I 2024-10-15 07:06:41 megatron_init:296] Rank 0 has model parallel group: [0]
[NeMo I 2024-10-15 07:06:41 megatron_init:297] All model parallel group ranks: [[0]]
[NeMo I 2024-10-15 07:06:41 megatron_init:306] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-10-15 07:06:41 megatron_init:310] All tensor model parallel group ranks: 

[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo I 2024-10-15 07:06:41 tokenizer_utils:188] Getting SentencePiece with model: /global/scratch/users/ksevegnani/nemo_test/llama-tokenizer.model
[NeMo I 2024-10-15 07:06:41 megatron_base_model:584] Padded vocab_size: 32128, original vocab_size: 32003, dummy tokens: 125.


[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:41 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo W 2024-10-15 07:06:42 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_z_loss_coeff in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:42 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_input_jitter_eps in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:42 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_token_dropping in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:42 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_token_dispatcher_type in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:06:42 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_per_layer_logging in its cfg. Add th

[NeMo I 2024-10-15 07:06:42 build_model:143]  > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 469256192


[NeMo W 2024-10-15 07:06:42 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:181: You have overridden `MegatronGPTModel.configure_sharded_model` which is deprecated. Please override the `configure_model` hook instead. Instantiation with the newer hook will be created on the device right away and have the right data type depending on the precision setting in the Trainer.
    
[NeMo W 2024-10-15 07:06:42 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:163: You are using the `dataloader_iter` step flavor. If you consume the iterator more than once per step, the `batch_idx` argument in any hook that takes it will not match with the batch index of the last batch consumed. This might have unforeseen effects on callbacks or code that expects to get the correct index. This will also not work well with gradient accumulation. This feature is very experimental and subject t

[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1592] Pipeline model parallel rank: 0, Tensor model parallel rank: 0, Number of model parameters on device: 4.69e+08. Number of precise model parameters on device: 469256192.
[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1446] Building GPT datasets.
[NeMo I 2024-10-15 07:06:43 utils:47] Let split_matrix = [(0, 0.9801980198019802), (0.9801980198019802, 0.9900990099009901), (0.9900990099009901, 1.0)]
[NeMo I 2024-10-15 07:06:43 utils:47] Building dataset splits with cls=GPTDataset, sizes=[100, 64, 50], and config=GPTDatasetConfig(random_seed=1234, sequence_length=4096, blend=(['/global/scratch/users/ksevegnani/nemo_test/pubmedqa_big_llama_input_document'], [1.0]), blend_per_split=None, split='99,1,1', split_matrix=[(0, 0.9801980198019802), (0.9801980198019802, 0.9900990099009901), (0.9900990099009901, 1.0)], num_dataset_builder_threads=1, path_to_cache=None, mmap_bin_files=True, mock=False, tokenizer=<nemo.collections.common.tokenizers.sent

[NeMo W 2024-10-15 07:06:43 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:06:43 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:06:43 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:06:43 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None
[NeMo W 2024-10-15 07:06:43 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:06:43 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:06:43 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:06:43 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None
[NeMo W 2024-10-15 07:06:43 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:06:43 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:06:43 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:06:43 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None


[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1530] Length of train dataset: 101
[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1532] Length of val dataset: 65
[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1534] Length of test dataset: 51
[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1535] Finished building GPT datasets.
[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1636] Setting up train dataloader with len(len(self._train_ds)): 101 and consumed samples: 0
[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1544] Building dataloader with consumed samples: 0
[NeMo I 2024-10-15 07:06:43 data_samplers:76] Instantiating MegatronPretrainingSampler with total_samples: 101 and consumed_samples: 0
[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1644] Setting up validation dataloader with len(len(self._validation_ds)): 65 and consumed samples: 0
[NeMo I 2024-10-15 07:06:43 megatron_gpt_model:1544] Building dataloader with consumed samples: 0
[NeMo I 2024-10-15 07:06:43 data_samplers:76] Instanti

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2024-10-15 07:06:43 modelPT:770] Optimizer config = MegatronDistributedFusedAdam (
    Parameter Group 0
        betas: [0.9, 0.95]
        bias_correction: True
        eps: 1e-08
        is_expert: False
        lr: 0.0003747454131290244
        weight_decay: 0.00018931221346043012
    )
[NeMo I 2024-10-15 07:06:43 lr_scheduler:923] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x5189ffbff460>" 
    will be used during training (effective maximum steps = 100) - 
    Parameters : 
    (warmup_ratio: 0.1793158753109551
    constant_steps: 0
    min_lr: 1.0e-05
    max_steps: 100
    )



  | Name  | Type          | Params
----------------------------------------
0 | model | Float16Module | 469 M 
----------------------------------------
469 M     Trainable params
0         Non-trainable params
469 M     Total params
1,877.025 Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

[NeMo W 2024-10-15 07:06:44 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `validation_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    
[NeMo W 2024-10-15 07:06:49 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `training_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    


Training: |                                                                                                   …

Validation: |                                                                                                 …

Epoch 0, global step 100: 'val_loss' reached 5.99272 (best 5.99272), saving model to '/global/scratch/users/ksevegnani/nemo_test/out/results/checkpoints/megatron_llama--val_loss=5.99-step=100-consumed_samples=100.0.ckpt' as top 10
`Trainer.fit` stopped: `max_steps=100` reached.
[I 2024-10-15 07:07:14,463] Trial 1 finished with value: 5.992722988128662 and parameters: {'learning_rate': 0.0003747454131290244, 'weight_decay': 0.00018931221346043012, 'warmup_ratio': 0.1793158753109551}. Best is trial 0 with value: 5.908960819244385.



run:
  name: llama2_7b
  results_dir: /global/scratch/users/ksevegnani/nemo_test/out
  time_limit: 0-01:30:00
  dependency: singleton
trainer:
  num_nodes: 1
  devices: 1
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 100
  max_time: 05:23:30:00
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: 32
  limit_test_batches: 50
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: ${run.results_dir}/results
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${run.name}
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: false
    save_nemo_on_train_end: false
    filename: megatron_llama--{val_loss:.2f}-{ste

[NeMo W 2024-10-15 07:07:14 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/_graveyard/precision.py:49: The `MixedPrecisionPlugin` is deprecated. Use `pytorch_lightning.plugins.precision.MixedPrecision` instead.
    


[NeMo I 2024-10-15 07:07:14 dist_ckpt_io:320] Using ('zarr', 1) dist-ckpt save strategy.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[NeMo I 2024-10-15 07:07:14 exp_manager:341] ExpManager schema
[NeMo I 2024-10-15 07:07:14 exp_manager:342] {'explicit_log_dir': None, 'exp_dir': None, 'name': None, 'version': None, 'use_datetime_version': True, 'resume_if_exists': False, 'resume_past_end': False, 'resume_ignore_no_checkpoint': False, 'resume_from_checkpoint': None, 'create_tensorboard_logger': True, 'summary_writer_kwargs': None, 'create_wandb_logger': False, 'wandb_logger_kwargs': None, 'create_mlflow_logger': False, 'mlflow_logger_kwargs': {'experiment_name': None, 'tracking_uri': None, 'tags': None, 'save_dir': './mlruns', 'prefix': '', 'artifact_location': None, 'run_id': None, 'log_model': False}, 'create_dllogger_logger': False, 'dllogger_logger_kwargs': {'verbose': False, 'stdout': False, 'json_file': './dllogger.json'}, 'create_clearml_logger': False, 'clearml_logger_kwargs': {'project': None, 'task': None, 'connect_pytorch': False, 'model_name': None, 'tags': None, 'log_model': False, 'log_cfg': False, 'log_

[NeMo W 2024-10-15 07:07:14 exp_manager:712] Exp_manager is logging to /global/scratch/users/ksevegnani/nemo_test/out/results, but it already exists.


[NeMo I 2024-10-15 07:07:14 exp_manager:400] Experiments will be logged at /global/scratch/users/ksevegnani/nemo_test/out/results
[NeMo I 2024-10-15 07:07:14 exp_manager:860] TensorboardLogger has been set up


[NeMo W 2024-10-15 07:07:14 exp_manager:970] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 100. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


[NeMo I 2024-10-15 07:07:14 3748006741:2] 
    
    ************** Experiment configuration ***********
[NeMo I 2024-10-15 07:07:14 3748006741:3] 
    run:
      name: llama2_7b
      results_dir: /global/scratch/users/ksevegnani/nemo_test/out
      time_limit: 0-01:30:00
      dependency: singleton
    trainer:
      num_nodes: 1
      devices: 1
      accelerator: gpu
      precision: bf16
      logger: false
      enable_checkpointing: false
      use_distributed_sampler: false
      max_epochs: null
      max_steps: 100
      max_time: 05:23:30:00
      log_every_n_steps: 10
      val_check_interval: 100
      limit_val_batches: 32
      limit_test_batches: 50
      accumulate_grad_batches: 1
      gradient_clip_val: 1.0
    exp_manager:
      explicit_log_dir: ${run.results_dir}/results
      exp_dir: null
      name: megatron_llama
      create_wandb_logger: false
      wandb_logger_kwargs:
        project: nemo_llama_pretrain
        name: ${run.name}
      resume_if_exists: fal

[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo I 2024-10-15 07:07:14 megatron_init:263] Rank 0 has data parallel group : [0]
[NeMo I 2024-10-15 07:07:14 megatron_init:269] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2024-10-15 07:07:14 megatron_init:274] All data parallel group ranks with context parallel combined: [[0]]
[NeMo I 2024-10-15 07:07:14 megatron_init:277] Ranks 0 has data parallel rank: 0
[NeMo I 2024-10-15 07:07:14 megatron_init:285] Rank 0 has context parallel group: [0]
[NeMo I 2024-10-15 07:07:14 megatron_init:288] All context parallel group ranks: [[0]]
[NeMo I 2024-10-15 07:07:14 megatron_init:289] Ranks 0 has context parallel rank: 0
[NeMo I 2024-10-15 07:07:14 megatron_init:296] Rank 0 has model parallel group: [0]
[NeMo I 2024-10-15 07:07:14 megatron_init:297] All model parallel group ranks: [[0]]
[NeMo I 2024-10-15 07:07:14 megatron_init:306] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-10-15 07:07:14 megatron_init:310] All tensor model parallel group ranks: 

[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo I 2024-10-15 07:07:14 tokenizer_utils:188] Getting SentencePiece with model: /global/scratch/users/ksevegnani/nemo_test/llama-tokenizer.model
[NeMo I 2024-10-15 07:07:14 megatron_base_model:584] Padded vocab_size: 32128, original vocab_size: 32003, dummy tokens: 125.


[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo W 2024-10-15 07:07:14 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_z_loss_coeff in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_input_jitter_eps in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_token_dropping in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_token_dispatcher_type in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:14 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_per_layer_logging in its cfg. Add th

[NeMo I 2024-10-15 07:07:14 build_model:143]  > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 469256192


[NeMo W 2024-10-15 07:07:14 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:181: You have overridden `MegatronGPTModel.configure_sharded_model` which is deprecated. Please override the `configure_model` hook instead. Instantiation with the newer hook will be created on the device right away and have the right data type depending on the precision setting in the Trainer.
    
[NeMo W 2024-10-15 07:07:14 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:163: You are using the `dataloader_iter` step flavor. If you consume the iterator more than once per step, the `batch_idx` argument in any hook that takes it will not match with the batch index of the last batch consumed. This might have unforeseen effects on callbacks or code that expects to get the correct index. This will also not work well with gradient accumulation. This feature is very experimental and subject t

[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1592] Pipeline model parallel rank: 0, Tensor model parallel rank: 0, Number of model parameters on device: 4.69e+08. Number of precise model parameters on device: 469256192.
[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1446] Building GPT datasets.
[NeMo I 2024-10-15 07:07:16 utils:47] Let split_matrix = [(0, 0.9801980198019802), (0.9801980198019802, 0.9900990099009901), (0.9900990099009901, 1.0)]
[NeMo I 2024-10-15 07:07:16 utils:47] Building dataset splits with cls=GPTDataset, sizes=[100, 64, 50], and config=GPTDatasetConfig(random_seed=1234, sequence_length=4096, blend=(['/global/scratch/users/ksevegnani/nemo_test/pubmedqa_big_llama_input_document'], [1.0]), blend_per_split=None, split='99,1,1', split_matrix=[(0, 0.9801980198019802), (0.9801980198019802, 0.9900990099009901), (0.9900990099009901, 1.0)], num_dataset_builder_threads=1, path_to_cache=None, mmap_bin_files=True, mock=False, tokenizer=<nemo.collections.common.tokenizers.sent

[NeMo W 2024-10-15 07:07:16 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:07:16 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:07:16 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:07:16 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None
[NeMo W 2024-10-15 07:07:16 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:07:16 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:07:16 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:07:16 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None
[NeMo W 2024-10-15 07:07:16 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:07:16 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:07:16 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:07:16 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None


[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1530] Length of train dataset: 101
[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1532] Length of val dataset: 65
[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1534] Length of test dataset: 51
[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1535] Finished building GPT datasets.
[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1636] Setting up train dataloader with len(len(self._train_ds)): 101 and consumed samples: 0
[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1544] Building dataloader with consumed samples: 0
[NeMo I 2024-10-15 07:07:16 data_samplers:76] Instantiating MegatronPretrainingSampler with total_samples: 101 and consumed_samples: 0
[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1644] Setting up validation dataloader with len(len(self._validation_ds)): 65 and consumed samples: 0
[NeMo I 2024-10-15 07:07:16 megatron_gpt_model:1544] Building dataloader with consumed samples: 0
[NeMo I 2024-10-15 07:07:16 data_samplers:76] Instanti

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2024-10-15 07:07:16 modelPT:770] Optimizer config = MegatronDistributedFusedAdam (
    Parameter Group 0
        betas: [0.9, 0.95]
        bias_correction: True
        eps: 1e-08
        is_expert: False
        lr: 0.0009889393275289025
        weight_decay: 0.0002050977932188532
    )
[NeMo I 2024-10-15 07:07:16 lr_scheduler:923] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x518993e1d540>" 
    will be used during training (effective maximum steps = 100) - 
    Parameters : 
    (warmup_ratio: 0.1530318683315003
    constant_steps: 0
    min_lr: 1.0e-05
    max_steps: 100
    )



  | Name  | Type          | Params
----------------------------------------
0 | model | Float16Module | 469 M 
----------------------------------------
469 M     Trainable params
0         Non-trainable params
469 M     Total params
1,877.025 Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

[NeMo W 2024-10-15 07:07:17 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `validation_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    
[NeMo W 2024-10-15 07:07:22 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `training_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    


Training: |                                                                                                   …

Validation: |                                                                                                 …

Epoch 0, global step 100: 'val_loss' reached 6.15920 (best 6.15920), saving model to '/global/scratch/users/ksevegnani/nemo_test/out/results/checkpoints/megatron_llama--val_loss=6.16-step=100-consumed_samples=100.0.ckpt' as top 10
`Trainer.fit` stopped: `max_steps=100` reached.
[I 2024-10-15 07:07:51,463] Trial 2 finished with value: 6.159204006195068 and parameters: {'learning_rate': 0.0009889393275289025, 'weight_decay': 0.0002050977932188532, 'warmup_ratio': 0.1530318683315003}. Best is trial 0 with value: 5.908960819244385.


In [12]:
# Print the best hyperparameters and corresponding validation loss
print("Best hyperparameters:", study.best_params)
print("Best validation loss:", study.best_value)

Best hyperparameters: {'learning_rate': 0.0003654503479685462, 'weight_decay': 0.0009979655831599337, 'warmup_ratio': 0.1911978919333179}
Best validation loss: 5.908960819244385


In [13]:
cfg = get_config()

# setting custom values
cfg.trainer.max_epochs = 10
cfg.trainer.devices = 1
cfg.trainer.precision = 16
cfg.trainer.accelerator = "gpu"
cfg.trainer.log_every_n_steps = 10
cfg.trainer.val_check_interval = 0.5


best_trainer = MegatronTrainerBuilder(cfg).create_trainer()

# best_trainer = pl.Trainer(
#     max_epochs=10,
#     gpus=1,
#     precision=16,
#     amp_level='O2',
#     accelerator="gpu",
#     strategy="ddp",
#     log_every_n_steps=10,
#     val_check_interval=0.5,
# )


run:
  name: llama2_7b
  results_dir: /global/scratch/users/ksevegnani/nemo_test/out
  time_limit: 0-01:30:00
  dependency: singleton
trainer:
  num_nodes: 1
  devices: 1
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 100
  max_time: 05:23:30:00
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: 32
  limit_test_batches: 50
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: ${run.results_dir}/results
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${run.name}
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: false
    save_nemo_on_train_end: false
    filename: megatron_llama--{val_loss:.2f}-{ste

[NeMo W 2024-10-15 07:07:51 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/_graveyard/precision.py:49: The `MixedPrecisionPlugin` is deprecated. Use `pytorch_lightning.plugins.precision.MixedPrecision` instead.
    


[NeMo I 2024-10-15 07:07:51 dist_ckpt_io:320] Using ('zarr', 1) dist-ckpt save strategy.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:

# Fine-tune the model with the best hyperparameters
best_model = initialize_model(cfg, best_trainer)
best_model.cfg.optim.lr = study.best_params["learning_rate"]
best_model.cfg.optim.weight_decay = study.best_params["weight_decay"]
best_model.cfg.optim.sched.warmup_ratio = study.best_params["warmup_ratio"]


[NeMo I 2024-10-15 07:07:51 3748006741:2] 
    
    ************** Experiment configuration ***********
[NeMo I 2024-10-15 07:07:51 3748006741:3] 
    run:
      name: llama2_7b
      results_dir: /global/scratch/users/ksevegnani/nemo_test/out
      time_limit: 0-01:30:00
      dependency: singleton
    trainer:
      num_nodes: 1
      devices: 1
      accelerator: gpu
      precision: 16
      logger: false
      enable_checkpointing: false
      use_distributed_sampler: false
      max_epochs: 10
      max_steps: 100
      max_time: 05:23:30:00
      log_every_n_steps: 10
      val_check_interval: 0.5
      limit_val_batches: 32
      limit_test_batches: 50
      accumulate_grad_batches: 1
      gradient_clip_val: 1.0
    exp_manager:
      explicit_log_dir: ${run.results_dir}/results
      exp_dir: null
      name: megatron_llama
      create_wandb_logger: false
      wandb_logger_kwargs:
        project: nemo_llama_pretrain
        name: ${run.name}
      resume_if_exists: false
 

[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo I 2024-10-15 07:07:51 megatron_init:263] Rank 0 has data parallel group : [0]
[NeMo I 2024-10-15 07:07:51 megatron_init:269] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2024-10-15 07:07:51 megatron_init:274] All data parallel group ranks with context parallel combined: [[0]]
[NeMo I 2024-10-15 07:07:51 megatron_init:277] Ranks 0 has data parallel rank: 0
[NeMo I 2024-10-15 07:07:51 megatron_init:285] Rank 0 has context parallel group: [0]
[NeMo I 2024-10-15 07:07:51 megatron_init:288] All context parallel group ranks: [[0]]
[NeMo I 2024-10-15 07:07:51 megatron_init:289] Ranks 0 has context parallel rank: 0
[NeMo I 2024-10-15 07:07:51 megatron_init:296] Rank 0 has model parallel group: [0]
[NeMo I 2024-10-15 07:07:51 megatron_init:297] All model parallel group ranks: [[0]]
[NeMo I 2024-10-15 07:07:51 megatron_init:306] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-10-15 07:07:51 megatron_init:310] All tensor model parallel group ranks: 

[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo I 2024-10-15 07:07:51 tokenizer_utils:188] Getting SentencePiece with model: /global/scratch/users/ksevegnani/nemo_test/llama-tokenizer.model
[NeMo I 2024-10-15 07:07:51 megatron_base_model:584] Padded vocab_size: 32128, original vocab_size: 32003, dummy tokens: 125.


[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:1154] The model: MegatronGPTModel() does not have field.name: deterministic_mode in its c

[NeMo W 2024-10-15 07:07:51 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_input_jitter_eps in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_token_dropping in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_token_dispatcher_type in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_per_layer_logging in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-10-15 07:07:51 megatron_base_model:556] The model: MegatronGPTModel() does not have field.name: moe_expert_capacity_factor in its c

[NeMo I 2024-10-15 07:07:51 build_model:143]  > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 469256192


In [15]:
cfg.exp_manager.exp_dir="best_model_experiment"
cfg.exp_manager.create_wandb_logger=False

best_trainer.logger=None
cfg.trainer.max_steps="null"
cfg.trainer.max_epochs=1
exp_manager(
    best_trainer,
    cfg.exp_manager
)

[NeMo I 2024-10-15 07:07:51 exp_manager:341] ExpManager schema
[NeMo I 2024-10-15 07:07:51 exp_manager:342] {'explicit_log_dir': None, 'exp_dir': None, 'name': None, 'version': None, 'use_datetime_version': True, 'resume_if_exists': False, 'resume_past_end': False, 'resume_ignore_no_checkpoint': False, 'resume_from_checkpoint': None, 'create_tensorboard_logger': True, 'summary_writer_kwargs': None, 'create_wandb_logger': False, 'wandb_logger_kwargs': None, 'create_mlflow_logger': False, 'mlflow_logger_kwargs': {'experiment_name': None, 'tracking_uri': None, 'tags': None, 'save_dir': './mlruns', 'prefix': '', 'artifact_location': None, 'run_id': None, 'log_model': False}, 'create_dllogger_logger': False, 'dllogger_logger_kwargs': {'verbose': False, 'stdout': False, 'json_file': './dllogger.json'}, 'create_clearml_logger': False, 'clearml_logger_kwargs': {'project': None, 'task': None, 'connect_pytorch': False, 'model_name': None, 'tags': None, 'log_model': False, 'log_cfg': False, 'log_

[NeMo E 2024-10-15 07:07:51 exp_manager:707] exp_manager received explicit_log_dir: /global/scratch/users/ksevegnani/nemo_test/out/results and at least one of exp_dir: best_model_experiment, or version: None. Please note that exp_dir, name, and version will be ignored.
[NeMo W 2024-10-15 07:07:51 exp_manager:712] Exp_manager is logging to /global/scratch/users/ksevegnani/nemo_test/out/results, but it already exists.


[NeMo I 2024-10-15 07:07:51 exp_manager:400] Experiments will be logged at /global/scratch/users/ksevegnani/nemo_test/out/results
[NeMo I 2024-10-15 07:07:51 exp_manager:860] TensorboardLogger has been set up


[NeMo W 2024-10-15 07:07:51 exp_manager:970] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 100. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


PosixPath('/global/scratch/users/ksevegnani/nemo_test/out/results')

In [16]:
best_trainer.fit(best_model)

[NeMo W 2024-10-15 07:07:52 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:181: You have overridden `MegatronGPTModel.configure_sharded_model` which is deprecated. Please override the `configure_model` hook instead. Instantiation with the newer hook will be created on the device right away and have the right data type depending on the precision setting in the Trainer.
    
[NeMo W 2024-10-15 07:07:52 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:163: You are using the `dataloader_iter` step flavor. If you consume the iterator more than once per step, the `batch_idx` argument in any hook that takes it will not match with the batch index of the last batch consumed. This might have unforeseen effects on callbacks or code that expects to get the correct index. This will also not work well with gradient accumulation. This feature is very experimental and subject t

[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1592] Pipeline model parallel rank: 0, Tensor model parallel rank: 0, Number of model parameters on device: 4.69e+08. Number of precise model parameters on device: 469256192.
[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1446] Building GPT datasets.
[NeMo I 2024-10-15 07:07:53 utils:47] Let split_matrix = [(0, 0.9801980198019802), (0.9801980198019802, 0.9900990099009901), (0.9900990099009901, 1.0)]
[NeMo I 2024-10-15 07:07:53 utils:47] Building dataset splits with cls=GPTDataset, sizes=[100, 6432.0, 50], and config=GPTDatasetConfig(random_seed=1234, sequence_length=4096, blend=(['/global/scratch/users/ksevegnani/nemo_test/pubmedqa_big_llama_input_document'], [1.0]), blend_per_split=None, split='99,1,1', split_matrix=[(0, 0.9801980198019802), (0.9801980198019802, 0.9900990099009901), (0.9900990099009901, 1.0)], num_dataset_builder_threads=1, path_to_cache=None, mmap_bin_files=True, mock=False, tokenizer=<nemo.collections.common.tokenizers.

[NeMo W 2024-10-15 07:07:53 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:07:53 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:07:53 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:07:53 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None
[NeMo W 2024-10-15 07:07:53 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:07:53 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:07:53 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:07:53 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None
[NeMo W 2024-10-15 07:07:53 utils:47] Building a BlendedDataset for a single MegatronDataset


[NeMo I 2024-10-15 07:07:53 utils:47] Build and save the BlendedDataset indices
[NeMo I 2024-10-15 07:07:53 utils:47] 	Build and save the dataset and dataset sample indexes


[NeMo W 2024-10-15 07:07:53 utils:47] Unable to save the BlendedDataset indexes because path_to_cache is None


[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1530] Length of train dataset: 101
[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1532] Length of val dataset: 6465
[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1534] Length of test dataset: 51
[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1535] Finished building GPT datasets.
[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1636] Setting up train dataloader with len(len(self._train_ds)): 101 and consumed samples: 0
[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1544] Building dataloader with consumed samples: 0
[NeMo I 2024-10-15 07:07:53 data_samplers:76] Instantiating MegatronPretrainingSampler with total_samples: 101 and consumed_samples: 0
[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1644] Setting up validation dataloader with len(len(self._validation_ds)): 6465 and consumed samples: 0
[NeMo I 2024-10-15 07:07:53 megatron_gpt_model:1544] Building dataloader with consumed samples: 0
[NeMo I 2024-10-15 07:07:53 data_samplers:76] Inst

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[NeMo W 2024-10-15 07:07:53 megatron_base_model:1195] Ignoring `trainer.max_epochs` when computing `max_steps` because `trainer.max_steps` is already set to 100.


[NeMo I 2024-10-15 07:07:53 modelPT:770] Optimizer config = MegatronDistributedFusedAdam (
    Parameter Group 0
        betas: [0.9, 0.95]
        bias_correction: True
        eps: 1e-08
        is_expert: False
        lr: 0.0003654503479685462
        weight_decay: 0.0009979655831599337
    )
[NeMo I 2024-10-15 07:07:53 lr_scheduler:923] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x5189ffc3a5c0>" 
    will be used during training (effective maximum steps = 100) - 
    Parameters : 
    (warmup_ratio: 0.1911978919333179
    constant_steps: 0
    min_lr: 1.0e-05
    max_steps: 100
    )



  | Name  | Type          | Params
----------------------------------------
0 | model | Float16Module | 469 M 
----------------------------------------
469 M     Trainable params
0         Non-trainable params
469 M     Total params
1,877.025 Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

[NeMo W 2024-10-15 07:07:54 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `validation_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    
[NeMo W 2024-10-15 07:08:01 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `training_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    


Training: |                                                                                                   …

    
[NeMo W 2024-10-15 07:08:11 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `validation_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    


Validation: |                                                                                                 …

Epoch 0, global step 50: 'val_loss' reached 7.68887 (best 7.68887), saving model to '/global/scratch/users/ksevegnani/nemo_test/out/results/checkpoints/megatron_llama--val_loss=7.69-step=50-consumed_samples=50.0.ckpt' as top 10


Validation: |                                                                                                 …

Epoch 0, global step 100: 'val_loss' reached 7.39861 (best 7.39861), saving model to '/global/scratch/users/ksevegnani/nemo_test/out/results/checkpoints/megatron_llama--val_loss=7.40-step=100-consumed_samples=100.0.ckpt' as top 10
`Trainer.fit` stopped: `max_steps=100` reached.


In [17]:

# Save the fine-tuned model
best_model.save_to("llama2-7b-finetuned-optuna.nemo")

[NeMo I 2024-10-15 07:08:42 dist_ckpt_io:320] Using ('zarr', 1) dist-ckpt save strategy.
