# Eagle 7B: Training on 1 trillion more tokens!

In [1]:
# =================================================================
# User-configurable settings
# =================================================================

# Weights & Biases logging: on/off switch, run-name prefix and project name
ENABLE_WANDB = True
WANDB_PREFIX = "Eagle-2T"
WANDB_PROJECT = "RWKV-x-Eagle-2T"

# Relative path from this notebook's directory up to the project root.
# Update it if the notebook is moved into another directory.
PROJECT_DIR_OFFSET = "../../../../"

# Directory holding the config file (relative to the notebook, without a
# trailing slash) and the config file name without its ".yaml" extension
CONFIG_FILE_DIR = "."
CONFIG_FILE_NAME = "Eagle-2T-C01"

# Value handed to the trainer's `--trainer.devices` option
GPU_DEVICES = "auto"

# =================================================================
# Training overrides passed on the trainer command line
# (strategy, model context length and per-step microbatch size)
# =================================================================
DEEPSPEED_STRAT = "deepspeed_stage_2"
TRAINING_CTX_LEN = 4096
MICROBATCH_SIZE = 8

# -----------------------------------------------------------------
# Echo the effective settings so they are captured in the cell output
# -----------------------------------------------------------------
print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)
print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)
print("TRAINING_CTX_LEN:", TRAINING_CTX_LEN)

# Translate the boolean toggle into the string later exported as the
# WANDB_MODE environment variable for the training command
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# Compute the notebook directory and the project / trainer paths derived from it
import os

# Notebooks have no usable `__file__`, so the kernel's current working
# directory is used as the notebook directory. (The previous expression,
# dirname(abspath("__file__")), resolved a *string literal* against the
# working directory and therefore produced this same value.)
# NOTE(review): assumes the kernel was started in the notebook's own folder.
NOTEBOOK_DIR = os.getcwd()
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, PROJECT_DIR_OFFSET))
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

# Fail fast when the trainer directory is missing (or is not a directory),
# which usually means PROJECT_DIR_OFFSET no longer matches the notebook location
if not os.path.isdir(TRAINER_DIR):
    raise FileNotFoundError(
        f"The trainer directory does not exist: {TRAINER_DIR}. "
        "Did you move the notebook?"
    )

ENABLE_WANDB: True
GPU_DEVICES: auto
DEEPSPEED_STRAT: deepspeed_stage_2
TRAINING_CTX_LEN: 4096
NOTEBOOK_DIR: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-run/chunk-1
TRAINER_DIR: /workspace/picocreator/RWKV-infctx-trainer/RWKV-v5
PROJECT_DIR: /workspace/picocreator/RWKV-infctx-trainer


## Start the training run!

In [None]:
# Set up the checkpoint dir: create ./checkpoint/<CONFIG_FILE_NAME>/ under the
# project root (`mkdir -p` is a no-op when the directory already exists).
# The {NAME} placeholders in the shell lines below are filled in by IPython
# from the notebook's Python variables before the command is run.
!cd "{PROJECT_DIR}" && mkdir -p "./checkpoint/{CONFIG_FILE_NAME}/"

# Let's start the training: change into the trainer directory, export the
# environment variables, then run `lightning_trainer.py fit` with the YAML
# config plus command-line overrides taken from the settings cell (W&B run
# name/project, strategy, microbatch size, context length, devices).
# NOTE(review): MASTER_PORT, WORLD_SIZE=32 and NODE_RANK=0 are hard-coded here;
#   presumably this cell is meant for node 0 of a multi-node run - confirm
#   before reusing it on a different cluster layout.
# NOTE(review): RWKV_NO_CUDA=1 / RWKV_TORCH_COMPILE=0 presumably select the
#   trainer's non-CUDA-kernel, non-torch.compile path - TODO confirm against
#   the trainer code.
# NOTE(review): --trainer.target_batch_size=1024 is also hard-coded rather
#   than read from the settings cell.
!cd "{TRAINER_DIR}" && \
    export MASTER_PORT=31856 && \
    export WORLD_SIZE=32 && export NODE_RANK=0 && \
    export RWKV_NO_CUDA=1 && \
    export RWKV_TORCH_COMPILE=0 && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/{CONFIG_FILE_DIR}/{CONFIG_FILE_NAME}.yaml" \
        --data.skip_datapath_setup=True \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - {CONFIG_FILE_NAME} (tctxlen={TRAINING_CTX_LEN}, {DEEPSPEED_STRAT})" \
        --trainer.logger.init_args.project="{WANDB_PROJECT}" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.target_batch_size=1024 \
        --trainer.microbatch_size={MICROBATCH_SIZE} \
        --model.ctx_len={TRAINING_CTX_LEN} \
        --trainer.devices="{GPU_DEVICES}"


[2024-02-08 07:56:19,958] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.1+cu121'
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-run/chunk-1/./Eagle-2T-C01.yaml', '--data.skip_datapath_setup=True', '--trainer.logger.init_args.name=Eagle-2T - Eagle-2T-C01 (tctxlen=4096, deepspeed_stage_2)', '--trainer.logger.init_args.project=RWKV-x-Eagle-2T', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=8', '--model.ctx_len=4096', '--trainer.devices=auto'], args=['fit', '-c', '/workspace/picocreator/RWKV-infctx-tr

## Export the model

In [None]:
# # Let's export the model from the checkpoint
# !cd "{TRAINER_DIR}" && \
#     python export_checkpoint.py "../checkpoint/{CONFIG_FILE_NAME}/last.ckpt" "../model/{CONFIG_FILE_NAME}.pth"
# !cd "{TRAINER_DIR}" && ls -alh "../model/{CONFIG_FILE_NAME}.pth"

## Sanity check (that the model actually produces output)

In [None]:
# # Let's do a quick dragon prompt validation
# !cd "{TRAINER_DIR}" && \
#     python3 dragon_test.py "../model/{CONFIG_FILE_NAME}.pth" "cuda bf16"