# Eagle 7B: Finetuning on various OSS instruct datasets

The following showcases an example of training the RWKV-v5 7B model on enwiki and various instruct datasets.

## Configure the env variables below
The default auto strategy should work on a single 4090, scaling all the way up to 8xH100s.

In [1]:
# -----------------------------------------------------------------
# Your configurable settings
#
# Everything in this block is meant to be edited by the user; the
# later cells only read these values.
# -----------------------------------------------------------------

# WANDB settings
# ENABLE_WANDB toggles WANDB_MODE between "online" and "disabled".
# WANDB_PREFIX is the leading part of the logger run name, and
# WANDB_PROJECT is passed to the trainer as the logger project.
ENABLE_WANDB=True
WANDB_PREFIX="RWKV-v5-Finetune"
WANDB_PROJECT="RWKV-v5-Finetune"

# Project directory offset: the path from the notebook's directory to the
# project root. You need to modify it if you move the notebook into another dir.
PROJECT_DIR_OFFSET="../../"

# Config dir (relative to the notebook, excluding the ending slash) and the
# config filename without its extension. The notebook uses
# "<CONFIG_FILE_NAME>-build.yaml" for the datapack build and
# "<CONFIG_FILE_NAME>.yaml" for the trainer; the same name is also used for
# the checkpoint directory and the exported model file.
CONFIG_FILE_DIR="."
CONFIG_FILE_NAME="Eagle-x-zMultipack-Instruct"

# The model to use: MODEL_URL is downloaded and saved as MODEL_NAME inside
# the project's "model" directory.
MODEL_NAME="RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth"
MODEL_URL="https://huggingface.co/RWKV/v5-Eagle-7B/resolve/main/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth?download=true"

# GPU devices to use, passed to the trainer as `--trainer.devices`.
# Any value other than "auto" is parsed with int() by the auto-detection
# logic below, so it must be an integer count written as a string.
GPU_DEVICES="auto"

# -----------------------------------------------------------------
# Lets detect the GPU vram sizes, and suggest a reasonable default
# based on the detected VRAM sizes
# -----------------------------------------------------------------

import torch

def suggest_training_settings(vram_gb, gpu_count=1):
    """Suggest training settings for the Eagle-7B model from the GPU class.

    Args:
        vram_gb: Total memory of a single GPU, in GB (GiB as computed from
            `total_memory / 1024**3`).
        gpu_count: Number of GPUs that will be used for training.

    Returns:
        A `(deepspeed_strategy, training_ctx_len, microbatch_size)` tuple.

    Raises:
        RuntimeError: If `vram_gb` is below the smallest supported tier.
    """
    if vram_gb < 17:
        # Raised explicitly (instead of `assert False`) so the check still
        # fires when Python runs with assertions disabled.
        raise RuntimeError(
            "For the Eagle-7B model, you need at least 18GB vram "
            f"(detected {vram_gb:.1f}GB)"
        )
    if vram_gb < 23:
        # This takes about 17.5GB vram on a single GPU
        # We DO NOT recommend training with ctx_len=128, as the training
        # quality will degrade noticeably. But it will work!
        return "deepspeed_stage_2_offload", 128, 1
    if vram_gb < 25:
        # This takes about 21GB vram on a single GPU
        return "deepspeed_stage_2_offload", 2048, 2
    if vram_gb < 78:
        # This takes about 23GB vram on a single GPU
        return "deepspeed_stage_2", 4096, (4 if gpu_count >= 8 else 2)
    # This is now the 80GB vram class
    return "deepspeed_stage_2", 4096, (8 if gpu_count >= 8 else 4)

# Default settings, used as-is when no CUDA GPU is detected
# NOTE: If you're not using cuda, you may want to manually change this around
DEEPSPEED_STRAT="deepspeed_stage_2"
TRAINING_CTX_LEN=2048
MICROBATCH_SIZE=1

if not torch.cuda.is_available() or torch.cuda.device_count() <= 0:
    print("No CUDA compatible GPU found, using default settings")
else:
    # -----------------------------------------------------------------
    # Auto select the strategy based on the detected VRAM size
    # -----------------------------------------------------------------

    # Only GPU 0 is inspected for its VRAM size
    GPU_COUNT=torch.cuda.device_count()
    GPU_0_VRAM_SIZE_GB=torch.cuda.get_device_properties(0).total_memory / 1024**3
    if GPU_DEVICES != "auto":
        # An explicit device count overrides the detected one
        GPU_COUNT=int(GPU_DEVICES)
    print("GPU_COUNT:", GPU_COUNT)
    print("GPU_0_VRAM_SIZE (GB):", GPU_0_VRAM_SIZE_GB)

    DEEPSPEED_STRAT, TRAINING_CTX_LEN, MICROBATCH_SIZE = suggest_training_settings(
        GPU_0_VRAM_SIZE_GB, GPU_COUNT
    )

# -----------------------------------------------------------------
# Optional manual overrides: uncomment any of the lines below to
# replace the "auto" default picked above
# -----------------------------------------------------------------
# DEEPSPEED_STRAT="deepspeed_stage_1"
# TRAINING_CTX_LEN=4096
# MICROBATCH_SIZE=8

# ---
# Map the wandb on/off switch to the WANDB_MODE value exported to the trainer
WANDB_MODE="online" if ENABLE_WANDB else "disabled"

# Echo the effective settings, so they are recorded in the cell output
print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)
print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)
print("TRAINING_CTX_LEN:", TRAINING_CTX_LEN)

# Computing the notebook, and various paths
import os

# A notebook has no real __file__: the literal string "__file__" is resolved
# against the kernel's current working directory, so its dirname is that
# working directory (normally the folder containing this notebook).
NOTEBOOK_DIR=os.path.dirname(os.path.abspath("__file__"))
PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, PROJECT_DIR_OFFSET))
TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

# Fail early, naming the path that was checked, if the trainer directory is
# missing. isdir() is used because later cells `cd` into this path, so a
# regular file with the same name must not pass the check.
if not os.path.isdir(TRAINER_DIR):
    raise FileNotFoundError(
        f"The trainer directory does not exist: {TRAINER_DIR}. "
        "Did you move the notebook? If so, update PROJECT_DIR_OFFSET."
    )

GPU_COUNT: 8
GPU_0_VRAM_SIZE (GB): 79.10943603515625
ENABLE_WANDB: True
GPU_DEVICES: auto
DEEPSPEED_STRAT: deepspeed_stage_2
TRAINING_CTX_LEN: 4096
NOTEBOOK_DIR: /workspace/picocreator/RWKV-infctx-trainer/notebook/finetune-example
TRAINER_DIR: /workspace/picocreator/RWKV-infctx-trainer/RWKV-v5
PROJECT_DIR: /workspace/picocreator/RWKV-infctx-trainer


## Let's download the model

In [2]:
# Download the base model into "{PROJECT_DIR}/model" (the directory is created
# if missing) and save it as MODEL_NAME. With `-nc`, wget does not retrieve
# the file again when it is already there.
# NOTE(review): a partial file left by an interrupted download would presumably
# also be treated as "already there" - delete it manually before re-running.
!cd "{PROJECT_DIR}" && mkdir -p "./model" && \
    cd "./model" && \
    wget -nc "{MODEL_URL}" -O "{MODEL_NAME}"

File ‘RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth’ already there; not retrieving.


## Build the dataset

In [5]:
# Lets build the giant datapack: run datapack_build.py from the trainer
# directory, using the "<CONFIG_FILE_NAME>-build.yaml" config that sits
# next to this notebook
!cd "{TRAINER_DIR}" && python3 datapack_build.py "{NOTEBOOK_DIR}/{CONFIG_FILE_DIR}/{CONFIG_FILE_NAME}-build.yaml"

>> Starting datapack build process for: /workspace/picocreator/RWKV-infctx-trainer/notebook/finetune-example/./Eagle-x-zMultipack-Instruct-build.yaml
>> Preparing dataset - index:  0  - name:  MedText
Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4500.45 examples/
Saving the dataset (1/1 shards): 100%|██| 15/15 [00:00<00:00, 734.83 examples/s]
>> Preparing dataset - index:  1  - name:  ALMA-prompt-completion
Saving the dataset (1/1 shards): 100%|█| 376/376 [00:00<00:00, 1536.56 examples/
Saving the dataset (1/1 shards): 100%|█| 1175/1175 [00:00<00:00, 36713.48 exampl
>> Preparing dataset - index:  2  - name:  OpenOrca
Saving the dataset (20/20 shards): 100%|█| 48493/48493 [00:29<00:00, 1632.41 exa
Saving the dataset (1/1 shards): 100%|█| 42340/42340 [00:00<00:00, 50486.14 exam
>> Preparing dataset - index:  3  - name:  openhermes-1-instruct
Saving the dataset (1/1 shards): 100%|█| 2381/2381 [00:01<00:00, 1573.62 example
Saving the dataset (1/1 shards): 100%|█| 2429/242

## Do the initial validation run (for reference)

In [6]:
# Setup the checkpoint dir for this config (`mkdir -p` leaves an existing one as-is)
!cd "{PROJECT_DIR}" && mkdir -p "./checkpoint/{CONFIG_FILE_NAME}/"

# Run the trainer's `validate` subcommand against the downloaded base model,
# to get a reference validation result before any finetuning is done.
# The deepspeed strategy, microbatch size, context length and devices all
# come from the settings cell at the top of the notebook; the run name sent
# to the logger embeds the config name, ctx length and strategy.
# NOTE(review): target_batch_size=1024 is hard-coded here - confirm it suits
# your GPU count / microbatch size before reusing this on other hardware.
!cd "{TRAINER_DIR}" && \
    export RWKV_NO_CUDA=0 && \
    export RWKV_TORCH_COMPILE=0 && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py validate \
        -c "{NOTEBOOK_DIR}/{CONFIG_FILE_DIR}/{CONFIG_FILE_NAME}.yaml" \
        --model.load_model="../model/{MODEL_NAME}" \
        --data.skip_datapath_setup=True \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - {CONFIG_FILE_NAME} (tctxlen={TRAINING_CTX_LEN}, {DEEPSPEED_STRAT})" \
        --trainer.logger.init_args.project="{WANDB_PROJECT}" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.target_batch_size=1024 \
        --trainer.microbatch_size={MICROBATCH_SIZE} \
        --model.ctx_len={TRAINING_CTX_LEN} \
        --trainer.devices="{GPU_DEVICES}"

[2024-02-06 05:15:39,276] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.1+cu121'
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['validate', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/finetune-example/./Eagle-x-zMultipack-Instruct.yaml', '--model.load_model=../model/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth', '--data.skip_datapath_setup=True', '--trainer.logger.init_args.name=RWKV-v5-Finetune - Eagle-x-zMultipack-Instruct (tctxlen=4096, deepspeed_stage_2)', '--trainer.logger.init_args.project=RWKV-v5-Finetune', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=8', '

## Start the training run!

In [None]:
# Setup the checkpoint dir for this config (`mkdir -p` leaves an existing one as-is)
!cd "{PROJECT_DIR}" && mkdir -p "./checkpoint/{CONFIG_FILE_NAME}/"

# Lets start the training: run the trainer's `fit` subcommand, starting from
# the downloaded base model. The deepspeed strategy, microbatch size, context
# length and devices all come from the settings cell at the top of the notebook.
# NOTE(review): this cell exports RWKV_NO_CUDA=1, while the validation cell
# exports RWKV_NO_CUDA=0 - confirm the difference is intentional.
# NOTE(review): target_batch_size=1024 is hard-coded - confirm it suits your
# GPU count / microbatch size.
!cd "{TRAINER_DIR}" && \
    export RWKV_NO_CUDA=1 && \
    export RWKV_TORCH_COMPILE=0 && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/{CONFIG_FILE_DIR}/{CONFIG_FILE_NAME}.yaml" \
        --model.load_model="../model/{MODEL_NAME}" \
        --data.skip_datapath_setup=True \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - {CONFIG_FILE_NAME} (tctxlen={TRAINING_CTX_LEN}, {DEEPSPEED_STRAT})" \
        --trainer.logger.init_args.project="{WANDB_PROJECT}" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.target_batch_size=1024 \
        --trainer.microbatch_size={MICROBATCH_SIZE} \
        --model.ctx_len={TRAINING_CTX_LEN} \
        --trainer.devices="{GPU_DEVICES}"

# For multi node training, you can add the respective env variables to the
# command above (before the `python3` line), adjusted to your exact multi
# node setup. Reminder: adjust target_batch_size as well. Example:
# export MASTER_ADDR=10.130.0.24 && export MASTER_PORT=31856 && \
# export WORLD_SIZE=32 && export NODE_RANK=1 && \

[2024-02-06 22:08:40,160] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.1+cu121'
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/finetune-example/./Eagle-x-zMultipack-Instruct.yaml', '--model.load_model=../model/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth', '--data.skip_datapath_setup=True', '--trainer.logger.init_args.name=RWKV-v5-Finetune - Eagle-x-zMultipack-Instruct (tctxlen=4096, deepspeed_stage_2)', '--trainer.logger.init_args.project=RWKV-v5-Finetune', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=8', '--mod

## Export the model

In [3]:
# Lets export the model from the checkpoint: convert the "last.ckpt"
# checkpoint of this config into a single "<CONFIG_FILE_NAME>.pth" file inside
# the project's "model" directory, then list the exported file.
# The training run must have written "last.ckpt" first; otherwise the export
# script fails with "Unable to find 'latest' file".
# `python3` is used (rather than `python`) to match every other cell, and
# because a bare `python` command is not guaranteed to exist.
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py "../checkpoint/{CONFIG_FILE_NAME}/last.ckpt" "../model/{CONFIG_FILE_NAME}.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{CONFIG_FILE_NAME}.pth"

[2024-02-06 06:13:47,590] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Traceback (most recent call last):
  File "/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 651, in <module>
    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)
  File "/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 542, in convert_zero_checkpoint_to_fp32_state_dict
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
  File "/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 516, in get_fp32_state_dict_from_zero_checkpoint
    raise ValueError(f"Unable to find 'latest' file at {latest_path}")
ValueError: Unable to find 'latest' file at ../checkpoint/Eagle-x-zMultipack-Instruct/last.ckpt/latest
ls: cannot access '../model/Eagle-x-zMultipack-Instruct.pth': No such file or directory


## Sanity check (that the model actually outputs stuff)

In [4]:
# Lets do a quick dragon prompt validation: run dragon_test.py from the
# trainer directory against the exported "<CONFIG_FILE_NAME>.pth" model.
# NOTE(review): "cuda bf16" is presumably the device and precision the test
# should run with - confirm against dragon_test.py.
!cd "{TRAINER_DIR}" && \
    python3 dragon_test.py "../model/{CONFIG_FILE_NAME}.pth" "cuda bf16"