In [None]:
import optuna
import nemo
import nemo.collections.nlp as nemo_nlp
from nemo.utils.exp_manager import exp_manager
import torch.multiprocessing as mp
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder

In [None]:
import os
import tempfile
import sys

In [None]:
from omegaconf.omegaconf import OmegaConf, open_dict
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
from nemo.core.config import hydra_runner

In [None]:
from hydra import initialize, compose

In [None]:
import warnings
# Suppress every Python warning for the rest of the kernel session: the "ignore"
# action is installed without a category or module restriction.
# NOTE(review): this also hides deprecation warnings from the training stack —
# confirm that blanket suppression is intended.
warnings.filterwarnings("ignore")

In [None]:
# Select the "spawn" start method for torch.multiprocessing (imported as `mp`);
# force=True replaces a start method that may already have been set.
# NOTE(review): presumably required so worker processes can use CUDA — confirm.
mp.set_start_method("spawn", force=True)

In [None]:
def get_config():
    """Compose the ``llama2_7b_optuna.yaml`` Hydra config, print it, and return it.

    Hydra is initialized with ``config_path="config"`` only for the duration of
    the ``compose`` call.

    Returns:
        The composed configuration object.
    """
    with initialize(config_path="config", version_base=None):
        composed = compose(config_name="llama2_7b_optuna.yaml")

    # Echo the full configuration as YAML, preceded by a newline.
    rendered = OmegaConf.to_yaml(composed)
    print(f'\n{rendered}')
    return composed

In [None]:
def initialize_model(cfg, trainer):
    """Build the ``MegatronGPTModel`` for ``cfg``, optionally restoring earlier state.

    Three cases are handled, in priority order:

    1. ``cfg.model.restore_from_path`` is set: restore through
       ``MegatronGPTModel.restore_from`` with ``cfg.model`` as the override config.
    2. ``cfg.model.restore_from_ckpt`` is set: point ``trainer.ckpt_path`` at that
       checkpoint and construct the model from ``cfg.model``.
    3. Neither is set: construct the model from ``cfg.model``.

    Args:
        cfg: Experiment configuration; ``cfg.model`` is read here and the whole
            config is logged as YAML.
        trainer: Trainer the model is attached to. Its ``ckpt_path`` attribute is
            overwritten in case 2.

    Returns:
        The constructed or restored ``MegatronGPTModel``.
    """
    # Imported here because the notebook's visible import cells provide neither
    # name; without them both restore branches fail with NameError.
    from pathlib import Path

    from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector

    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    # Continual training
    if cfg.model.get("restore_from_path") is not None:
        # Option 1: Restore only the model weights from a .nemo file
        logging.info(f"Continual training: loading weights from {cfg.model.restore_from_path}")
        model = MegatronGPTModel.restore_from(
            restore_path=cfg.model.restore_from_path,
            override_config_path=cfg.model,
            trainer=trainer,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    elif cfg.model.get("restore_from_ckpt") is not None:
        # Option 2: Restore both model weights and optimizer states from a PTL checkpoint
        logging.info(f"Continual training: loading weights and optimizer states from {cfg.model.restore_from_ckpt}")
        trainer.ckpt_path = Path(cfg.model.restore_from_ckpt)
        model = MegatronGPTModel(cfg.model, trainer)
    else:
        # Start new pretraining or resume from a checkpoint if it exists
        model = MegatronGPTModel(cfg.model, trainer)

    return model

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Train once with trial-suggested optimizer settings and return the validation loss.

    A fresh configuration, trainer and model are created on every call. The
    suggested learning rate, weight decay and warmup ratio are written into
    ``model.cfg.optim`` before ``trainer.fit`` is called.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` (log scale,
            1e-5..1e-3), ``weight_decay`` (log scale, 1e-5..1e-2) and
            ``warmup_ratio`` (0.0..0.2).

    Returns:
        The ``val_loss`` entry of ``trainer.callback_metrics`` as a Python number.

    Raises:
        KeyError: If no ``val_loss`` metric was recorded during training.
    """
    cfg = get_config()

    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-2, log=True)
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.2)

    # NOTE(review): every trial passes the same cfg.exp_manager settings, so
    # trials may share an experiment directory — confirm that a later trial
    # cannot resume from an earlier trial's checkpoint.
    trainer = MegatronTrainerBuilder(cfg).create_trainer()
    exp_manager(trainer, cfg.exp_manager)

    # Load the pre-trained Llama 2 model
    model = initialize_model(cfg, trainer)

    # Configure the model with suggested hyperparameters. open_dict lifts the
    # struct flag for the duration of the block, so a key that is absent from
    # the config (e.g. sched.warmup_ratio) can still be assigned.
    with open_dict(model.cfg):
        model.cfg.optim.lr = learning_rate
        model.cfg.optim.weight_decay = weight_decay
        model.cfg.optim.sched.warmup_ratio = warmup_ratio

    # Fine-tune the model
    trainer.fit(model)

    # Return the validation loss as the objective value; fail with a readable
    # message when validation never logged it.
    metrics = trainer.callback_metrics
    if 'val_loss' not in metrics:
        raise KeyError(
            "'val_loss' not found in trainer.callback_metrics "
            f"(available metrics: {sorted(metrics)})"
        )
    return metrics['val_loss'].item()



In [None]:
# Create an Optuna study. The objective returns the validation loss, so the
# study direction is "minimize" (lower is better).
# NOTE(review): no sampler seed is given, so sampled hyperparameters will differ
# between runs — confirm whether reproducibility is required.
study = optuna.create_study(direction="minimize")

In [None]:
# Optimize the hyperparameters: run `objective` for 3 trials, each of which
# builds a new trainer and model. gc_after_trial=True asks Optuna to run
# garbage collection after every trial.
study.optimize(objective, n_trials=3, gc_after_trial=True)

In [None]:
# Print the best hyperparameters and corresponding validation loss
# (the parameters and objective value of the study's best trial).
print("Best hyperparameters:", study.best_params)
print("Best validation loss:", study.best_value)

In [None]:
# Re-compose the base configuration for the final training run.
cfg = get_config()

# Trainer settings for the final run; they are applied before the trainer is
# built below so that MegatronTrainerBuilder sees them.
cfg.trainer.max_epochs = 10
cfg.trainer.devices = 1
cfg.trainer.precision = 16
cfg.trainer.accelerator = "gpu"
cfg.trainer.log_every_n_steps = 10
cfg.trainer.val_check_interval = 0.5


# Build the trainer from the updated configuration.
best_trainer = MegatronTrainerBuilder(cfg).create_trainer()

# best_trainer = pl.Trainer(
#     max_epochs=10,
#     gpus=1,
#     precision=16,
#     amp_level='O2',
#     accelerator="gpu",
#     strategy="ddp",
#     log_every_n_steps=10,
#     val_check_interval=0.5,
# )

In [None]:

# Build the final model and apply the best hyperparameters found by the study.
# Training itself happens later, when best_trainer.fit is called.
best_model = initialize_model(cfg, best_trainer)

# open_dict lifts the struct flag for the duration of the block, so a key that
# is absent from the config (e.g. sched.warmup_ratio) can still be assigned.
with open_dict(best_model.cfg):
    best_model.cfg.optim.lr = study.best_params["learning_rate"]
    best_model.cfg.optim.weight_decay = study.best_params["weight_decay"]
    best_model.cfg.optim.sched.warmup_ratio = study.best_params["warmup_ratio"]


In [None]:
# Send the final run's experiment output to its own directory and disable the
# Weights & Biases logger.
cfg.exp_manager.exp_dir = "best_model_experiment"
cfg.exp_manager.create_wandb_logger = False

# Drop the trainer's current logger before handing the trainer to exp_manager.
# NOTE(review): presumably so exp_manager can attach its own loggers — confirm.
best_trainer.logger = None

# Use a real null (None) instead of the literal string "null", which a config
# consumer would read as text rather than "unset".
# NOTE(review): best_trainer was already built from cfg in an earlier cell, so
# these two cfg.trainer overrides do not change that trainer; max_epochs=1 also
# conflicts with the max_epochs=10 set before the trainer was created. Confirm
# the intended limits and move the overrides ahead of trainer creation if needed.
cfg.trainer.max_steps = None
cfg.trainer.max_epochs = 1

# Set up experiment management (directories, logging) for the final run.
exp_manager(
    best_trainer,
    cfg.exp_manager
)

In [None]:
# Train the model configured with the best hyperparameters on best_trainer.
best_trainer.fit(best_model)

In [None]:

# Save the fine-tuned model to a .nemo file; the path is relative, so the file
# is written to the current working directory.
best_model.save_to("llama2-7b-finetuned-optuna.nemo")