# RWKV v5-wavenet 1B5 / embedding init-range 1e-01 / 16k
This model is based on the RWKV standard 1B5 model

- 24 layers
- 2048 embedding size

This notebook goes through the modified memory training for v5 models, across various initial embedding model weights.

**Note:** This project assumes you have the `rwkv-infctx` conda env set up.

# Basic Setup

In [1]:
# First, let's set up the various working directories (model, datapath, checkpoint).
# The paths are relative to the notebook's current working directory, and
# `mkdir -p` keeps this cell safe to re-run when the directories already exist.
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/

In [2]:
# Additional dependencies for the eval scripts.
# `%pip` (instead of `!pip`) guarantees the packages are installed into the
# environment of the running kernel, not whichever `pip` is first on PATH.
%pip install -q aiocsv aiofiles

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [3]:
import os

# ---- Training / logging configuration ----
DEEPSPEED_STRAT = "deepspeed_stage_1"
GPU_DEVICES = "auto"
ENABLE_WANDB = True
EMBED_SCALE = 0.1

# Prefix used for the wandb run names
WANDB_PREFIX = f"v5wave-1B5-{EMBED_SCALE}"

# Filename-friendly form of the embed scale ("." replaced with "_"),
# used for checkpoint directories and exported model files
EMBED_SCALE_LABEL = str(EMBED_SCALE).replace(".", "_")
FILENAME_PREFIX = f"v5wave-1B5-E{EMBED_SCALE_LABEL}"

# WAVENET LAYERS settings (exported to the scripts as an environment variable)
RWKV_WAVENET_LAYERS = 13

print(f"DEEPSPEED_STRAT: {DEEPSPEED_STRAT}")
print(f"ENABLE_WANDB: {ENABLE_WANDB}")
print(f"GPU_DEVICES: {GPU_DEVICES}")
print(f"RWKV_WAVENET_LAYERS: {RWKV_WAVENET_LAYERS}")

# wandb is either fully online, or disabled entirely
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# ---- Notebook and project paths ----
# NOTE: "__file__" is deliberately a *string literal* here (notebooks do not
# define __file__). abspath() resolves it against the current working
# directory, so its dirname is the directory the notebook is running from.
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__"))
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../../"))
# The trainer and the inference code both live in the same directory
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5wavenet/"))
INFERENCE_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5wavenet/"))

print(f"NOTEBOOK_DIR: {NOTEBOOK_DIR}")
print(f"INFERENCE_DIR: {INFERENCE_DIR}")
print(f"TRAINER_DIR: {TRAINER_DIR}")
print(f"PROJECT_DIR: {PROJECT_DIR}")

DEEPSPEED_STRAT: deepspeed_stage_1
ENABLE_WANDB: True
GPU_DEVICES: auto
RWKV_WAVENET_LAYERS: 13
NOTEBOOK_DIR: /root/rwkv-x-playground/notebook/experiment/rwkv-x-exp/v5-wave-memory
INFERENCE_DIR: /root/rwkv-x-playground/RWKV-v5wavenet
TRAINER_DIR: /root/rwkv-x-playground/RWKV-v5wavenet
PROJECT_DIR: /root/rwkv-x-playground


In [4]:
# Init the model: runs init_model.py from TRAINER_DIR for a 24 layer,
# 2048 embedding size model with the neox vocab, using EMBED_SCALE as the
# embedding init scale. RWKV_WAVENET_LAYERS is exported as an environment variable.
# `--skip-if-exists` is passed; the logged output below shows the init being
# skipped when the target .pth file already exists.
!cd "{TRAINER_DIR}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python3 ./init_model.py \
        --n_layer 24 --n_embd 2048 \
        --emb-scale "{EMBED_SCALE}" \
        --vocab_size neox --skip-if-exists \
        "../model/L24-D2048-E{EMBED_SCALE_LABEL}-neox-v5wave-init.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
---- Initializing model ----
No of layers: 24
Embedding size: 2048
Output model path: ../model/L24-D2048-E0_1-neox-v5wave-init.pth
Vocab size: 50277
---- ----- ----
Model exists, skipping init_model


## Enwiki Stage 1 : Foundation 4k model training

In [5]:
# Let's preload the required dataset, as described by the enwiki-4k YAML config
# stored next to this notebook (the script itself is run from TRAINER_DIR)
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/v5wave-1B5-enwiki-4k.yaml"

Found cached dataset parquet (/root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 71.87it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-1838a89ade08598f_*_of_00064.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-4ed06743150889b3_*_of_00064.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-8447c42bccdba86a_*_of_00064.arrow
Loading cached split indices for

In [6]:
# Start the foundation model training (enwiki-4k config).
# - RWKV_WAVENET_LAYERS and WANDB_MODE are exported as environment variables for the trainer
# - training starts from the freshly initialised model file
# - checkpoints go to the "<FILENAME_PREFIX>-enwiki-4k" directory under ../checkpoint/
#   (relative to TRAINER_DIR)
# - the command line overrides the YAML with ctx_len=4096 and bptt_learning_range=1
!cd "{TRAINER_DIR}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-enwiki-4k.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-enwiki-4k/" \
        --model.load_model="../model/L24-D2048-E{EMBED_SCALE_LABEL}-neox-v5wave-init.pth" \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 2617883477
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230815_091635-36viply8[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Enwiki-4k Foundation (train-ctx=4k, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experi

In [7]:
# Let's export the model from the last checkpoint into a standalone .pth file,
# then list the exported file to confirm it was written (and see its size)
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt" "../model/{FILENAME_PREFIX}-enwiki-4k.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-enwiki-4k.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/v5wave-1B5-E0_1-enwiki-4k/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 8
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 486 params 1515107840 elements
Saving fp32 state dict to ../model/v5wave-1B5-E0_1-enwiki-4k.pth
-rw-r--r-- 1 root root 5.7G Aug 15 17:00 ../model/v5wave-1B5-E0_1-enwiki-4k.pth


In [8]:
# Let's do a quick dragon prompt validation: a sanity check of the text generated
# by the exported enwiki-4k model (run from INFERENCE_DIR, with "cuda fp32" passed
# as the second script argument)
!cd "{INFERENCE_DIR}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python3 dragon_test.py "../model/{FILENAME_PREFIX}-enwiki-4k.pth" "cuda fp32"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
--- DRAGON PROMPT ---
In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese. The murder of Montrippō, who were German American king and relatives of the organization, who declined for work in the capital of India.

As of July 18, 2007, he was married to Egypt as he had previously made the women's influence of his poems. But if he would try to bring him to be acting at that time in court. While this change a year later he managed to prepare, while his actions у realized that it had "never come on my story he as me.

In 2009, Numi's father as his father's one solo officer, an old female son of Mayima and his colleagues. On his stage show The Battle of Tri, it was written in London and American articles, including

In [9]:
# Let's do a quick memory test on the exported enwiki-4k model.
# Note: there is no `cd` here, so the script path is relative to the notebook's
# working directory, while the model is addressed through the absolute PROJECT_DIR.
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 5 tokens : 0.0% similarity, with 0 matched token, and 5 token mismatch
## Model validation for 10 tokens : 0.0% similarity, with 0 matched token, and 10 token mismatch
## Model validation for 15 tokens : 0.0% similarity, with 0 matched token, and 15 token mismatch
## Model validation for 20 tokens : 0.0% similarity, with 0 matched token, and 20 token mismatch
## Model validation for 25 tokens : 0.0% similarity, with 0 matched token, and 25 token mismatch
## Model validation for 30 tokens : 0.0% similarity, with 0 matched token, and 30 token mismatch
## Model validation for 35 tokens : 0.0% similarity, with 0 matched token, and 35 token mismatch
## Model validation for 40 tokens : 0.0% similarity, with 0 matched token, and 40 token mismatch
## Model validation for 45 tokens : 0.0% similarity, with 0 matched

## Enwiki Stage 1 : Foundation 16k model training

In [10]:
# Let's preload the required dataset, as described by the enwiki-16k YAML config
# stored next to this notebook (the script itself is run from TRAINER_DIR)
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/v5wave-1B5-enwiki-16k.yaml"

Found cached dataset parquet (/root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 75.05it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-8c145c390c889a8f_*_of_00064.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-6ea97834c464ff4e_*_of_00064.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-50665dd4de80b503_*_of_00064.arrow
Loading cached split indices for

In [11]:
# Continue the foundation model training with the enwiki-16k config,
# starting from the exported enwiki-4k model.
# - WANDB_MODE and RWKV_WAVENET_LAYERS are exported as environment variables for the trainer
# - checkpoints go to the "<FILENAME_PREFIX>-enwiki-16k" directory under ../checkpoint/
#   (relative to TRAINER_DIR)
# NOTE(review): ctx_len stays at 4096 while bptt_learning_range is raised to 4 -
# presumably 4 segments of 4k make up the 16k training window, which would also
# explain the "train-ctx=4k" label in the run name; confirm against the trainer.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-enwiki-16k.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-16k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-enwiki-16k/" \
        --model.load_model="../model/{FILENAME_PREFIX}-enwiki-4k.pth" \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=4

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 1634498990
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230815_170207-z6v6h227[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Enwiki-16k Foundation (train-ctx=4k, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Exper

In [12]:
# Let's export the model from the last checkpoint into a standalone .pth file,
# then list the exported file to confirm it was written (and see its size)
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-enwiki-16k/last.ckpt" "../model/{FILENAME_PREFIX}-enwiki-16k.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-enwiki-16k.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/v5wave-1B5-E0_1-enwiki-16k/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 8
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 486 params 1515107840 elements
Saving fp32 state dict to ../model/v5wave-1B5-E0_1-enwiki-16k.pth
-rw-r--r-- 1 root root 5.7G Aug 15 23:35 ../model/v5wave-1B5-E0_1-enwiki-16k.pth


In [13]:
# Let's do a quick dragon prompt validation: a sanity check of the text generated
# by the exported enwiki-16k model (run from INFERENCE_DIR, with "cuda fp32" passed
# as the second script argument)
!cd "{INFERENCE_DIR}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python3 dragon_test.py "../model/{FILENAME_PREFIX}-enwiki-16k.pth" "cuda fp32"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
--- DRAGON PROMPT ---
In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese. Aware that the rings in the nest of an ancient people had just one cub in the middle of the temple, and this thought was clear from this assumption, which was later disputed by a Chinese legend.

His work and his experiments in the cave's garden were difficult. His reputation was carefully promoted. The Buddha began using two different methods, the first being the "brother of the Buddha" (Chinese: ����; pinyin: p�� ���; pinyin: m��in-ào). The Buddha would have to travel to the temples of the Buddha, while the Buddha would keep a bird on his other hand, but he also needed to cultivate the Buddha's tail and horns to enter into the cave o

In [14]:
# Let's do a quick memory test on the exported enwiki-16k model.
# Note: there is no `cd` here, so the script path is relative to the notebook's
# working directory, while the model is addressed through the absolute PROJECT_DIR.
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-16k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 5 tokens : 20.0% similarity, with 1 matched token, and 4 token mismatch
## Model validation for 10 tokens : 10.0% similarity, with 1 matched token, and 9 token mismatch
## Model validation for 15 tokens : 6.666666666666667% similarity, with 1 matched token, and 14 token mismatch
## Model validation for 20 tokens : 10.0% similarity, with 2 matched token, and 18 token mismatch
## Model validation for 25 tokens : 8.0% similarity, with 2 matched token, and 23 token mismatch
## Model validation for 30 tokens : 3.3333333333333335% similarity, with 1 matched token, and 29 token mismatch
## Model validation for 35 tokens : 2.857142857142857% similarity, with 1 matched token, and 34 token mismatch
## Model validation for 40 tokens : 2.5% similarity, with 1 matched token, and 39 token mismatch
## Model validation fo

# Enwiki Stage 2 : Basic Instruct Tuning

In [15]:
# Let's preload the required dataset, as described by the enwiki-instruct YAML config
# stored next to this notebook (the script itself is run from TRAINER_DIR)
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/v5wave-1B5-enwiki-instruct.yaml"

Found cached dataset parquet (/root/.cache/huggingface/datasets/c-s-ale___parquet/c-s-ale--dolly-15k-instruction-alpaca-format-9dfbb23260d63d9d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 63.78it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/c-s-ale___parquet/c-s-ale--dolly-15k-instruction-alpaca-format-9dfbb23260d63d9d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-e4df40d582f09838_*_of_00064.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/c-s-ale___parquet/c-s-ale--dolly-15k-instruction-alpaca-format-9dfbb23260d63d9d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-6d5405ad1f265e84_*_of_00064.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/c-s-ale___parquet/c-s-ale--dolly-15k-instruction-alpaca-format-9dfbb23260d63d9d/0.0.0/14a00e99c0d15a236

In [16]:
# Start the instruct finetuning, starting from the exported enwiki-16k model.
# - WANDB_MODE and RWKV_WAVENET_LAYERS are exported as environment variables for the trainer
# - checkpoints go to the "<FILENAME_PREFIX>-enwiki-instruct" directory under ../checkpoint/
#   (relative to TRAINER_DIR)
# - the command line overrides the YAML with ctx_len=4096 and bptt_learning_range=1
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-enwiki-instruct.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/" \
        --model.load_model="../model/{FILENAME_PREFIX}-enwiki-16k.pth" \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 2624976476
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230815_233711-5fw9xtx3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Enwiki-Instruct (train-ctx=4k, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments

In [44]:
# Let's export the model from the last checkpoint into a standalone .pth file,
# then list the exported file to confirm it was written (and see its size)
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt" "../model/{FILENAME_PREFIX}-enwiki-instruct.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-enwiki-instruct.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/v5wave-1B5-E0_1-enwiki-instruct/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 8
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 486 params 1515107840 elements
Saving fp32 state dict to ../model/v5wave-1B5-E0_1-enwiki-instruct.pth
-rw-r--r-- 1 root root 5.7G Aug 16 01:48 ../model/v5wave-1B5-E0_1-enwiki-instruct.pth


In [45]:
# Let's do a quick dragon prompt validation: a sanity check of the text generated
# by the exported enwiki-instruct model (run from INFERENCE_DIR, with "cuda fp32"
# passed as the second script argument)
!cd "{INFERENCE_DIR}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python3 dragon_test.py "../model/{FILENAME_PREFIX}-enwiki-instruct.pth" "cuda fp32"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
--- DRAGON PROMPT ---
In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese. This makes the yMushaya belong to other planets in the Yakao mountains. The greater the "mushoy," a half of billions of years ago it was not about 2,000 years ago when the earth was once discovered and only after the elephants were not observed. While the conditions had also been reached and the planet was most impacted by the Earth's collapse, scientists were able to reach the earth. They were very different from the sun. There is no one before the Earth's circumference. The Neolithic world might be thought of as a shining earth, and a force would have been called the Sun. They are also likely to have been completely completed, so it 

In [46]:
# Let's do a quick memory test on the exported enwiki-instruct model.
# Note: there is no `cd` here, so the script path is relative to the notebook's
# working directory, while the model is addressed through the absolute PROJECT_DIR.
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 5 tokens : 0.0% similarity, with 0 matched token, and 5 token mismatch
## Model validation for 10 tokens : 0.0% similarity, with 0 matched token, and 10 token mismatch
## Model validation for 15 tokens : 0.0% similarity, with 0 matched token, and 15 token mismatch
## Model validation for 20 tokens : 0.0% similarity, with 0 matched token, and 20 token mismatch
## Model validation for 25 tokens : 0.0% similarity, with 0 matched token, and 25 token mismatch
## Model validation for 30 tokens : 0.0% similarity, with 0 matched token, and 30 token mismatch
## Model validation for 35 tokens : 0.0% similarity, with 0 matched token, and 35 token mismatch
## Model validation for 40 tokens : 0.0% similarity, with 0 matched token, and 40 token mismatch
## Model validation for 45 tokens : 0.0% similarity, with 0 matched

## Tune 1 : Simple Memory instruct finetuning

- Tune 1: Low ctx size (512), Training with only the input masked. This does very limited memory training, and is used primarily to train the instruction set.

In [47]:
%%script bash

########################################
# Generate the required jsonl dataset
########################################

# Reset the dataset dir (only previously generated *.jsonl files are removed)
mkdir -p ../dataset
rm -rf ../dataset/*.jsonl

# Generate the various datasets
echo "## Generating word reptition dataset ##"

# We apply a strong bias towards the smaller word counts, to teach the concept
# from scratch so that the model can learn the function.
#
# Note that all document samples are randomized between the target word count
# and half of the target word count.
#
# Arguments: <output jsonl path> <max word count> <sample count>
# Every generator is started as a background job (&) so they run in parallel.
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-2-count.jsonl  2  5000 &
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-5-count.jsonl  5  5000 &
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-10-count.jsonl 10 2500 &
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-15-count.jsonl 15 2500 &
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-20-count.jsonl 20 2500 &
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-25-count.jsonl 25 2500 &
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-40-count.jsonl 40 2500 &
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-50-count.jsonl 50 2500 &
# Fixed: this file was previously generated with a max word count of 80,
# which made it a duplicate of word-80-count.jsonl under a misleading name.
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-60-count.jsonl 60 2500 &
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-80-count.jsonl 80 2500 &

# With a slight mix of the larger word count
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-100-count.jsonl 100 2500 &
python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-200-count.jsonl 200 2500 &

# Block until every background generator has finished
wait
echo "## Done ##"

ls -alh ../dataset/

## Generating word reptition dataset ##
Generated JSONL file with - 10 max words, 2500 samples - at ../dataset/word-10-count.jsonl
Generated JSONL file with - 15 max words, 2500 samples - at ../dataset/word-15-count.jsonl
Generated JSONL file with - 20 max words, 2500 samples - at ../dataset/word-20-count.jsonl
Generated JSONL file with - 2 max words, 5000 samples - at ../dataset/word-2-count.jsonl
Generated JSONL file with - 25 max words, 2500 samples - at ../dataset/word-25-count.jsonl
Generated JSONL file with - 5 max words, 5000 samples - at ../dataset/word-5-count.jsonl
Generated JSONL file with - 40 max words, 2500 samples - at ../dataset/word-40-count.jsonl
Generated JSONL file with - 50 max words, 2500 samples - at ../dataset/word-50-count.jsonl
Generated JSONL file with - 80 max words, 2500 samples - at ../dataset/word-80-count.jsonl
Generated JSONL file with - 80 max words, 2500 samples - at ../dataset/word-60-count.jsonl
Generated JSONL file with - 100 max words, 2500 sample

In [48]:
# Start the Tune 1 finetune (memory instruct), starting from the exported
# enwiki-instruct model.
# - WANDB_MODE and RWKV_WAVENET_LAYERS are exported as environment variables for the trainer
# - checkpoints go to the "<FILENAME_PREFIX>-mem-instruct" directory under ../checkpoint/
#   (relative to TRAINER_DIR)
# - the command line overrides the YAML with ctx_len=512 and bptt_learning_range=1
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-mem-instruct.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Instruct (train-ctx=512, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-instruct/" \
        --model.load_model="../model/{FILENAME_PREFIX}-enwiki-instruct.pth" \
        --model.ctx_len=512 \
        --model.bptt_learning_range=1

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 2124796021
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230816_015109-a929z138[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Mem-Instruct (train-ctx=512, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments/r

In [49]:
# Let's export the model from the last checkpoint into a standalone .pth file,
# then list the exported file to confirm it was written (and see its size)
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-instruct/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-instruct.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-instruct.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/v5wave-1B5-E0_1-mem-instruct/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 8
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 486 params 1515107840 elements
Saving fp32 state dict to ../model/v5wave-1B5-E0_1-mem-instruct.pth
-rw-r--r-- 1 root root 5.7G Aug 16 02:15 ../model/v5wave-1B5-E0_1-mem-instruct.pth


In [50]:
# Let's do a quick memory test on the exported mem-instruct model.
# Note: there is no `cd` here, so the script path is relative to the notebook's
# working directory, while the model is addressed through the absolute PROJECT_DIR.
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-instruct.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 5 tokens : 20.0% similarity, with 1 matched token, and 4 token mismatch
## Model validation for 10 tokens : 10.0% similarity, with 1 matched token, and 9 token mismatch
## Model validation for 15 tokens : 6.666666666666667% similarity, with 1 matched token, and 14 token mismatch
## Model validation for 20 tokens : 5.0% similarity, with 1 matched token, and 19 token mismatch
## Model validation for 25 tokens : 4.0% similarity, with 1 matched token, and 24 token mismatch
## Model validation for 30 tokens : 3.3333333333333335% similarity, with 1 matched token, and 29 token mismatch
## Model validation for 35 tokens : 2.857142857142857% similarity, with 1 matched token, and 34 token mismatch
## Model validation for 40 tokens : 2.5% similarity, with 1 matched token, and 39 token mismatch
## Model validation for

## Tune 2 : Low ctx size (512), memory training

- Tune 2: Low ctx size (512), Training with instruction & input masked. This forces the actual memory training on the output tokens.

In [51]:
%%script bash

########################################
# Generate the required jsonl dataset
########################################

# Reset the dataset dir (only previously generated *.jsonl files are removed)
mkdir -p ../dataset
rm -rf ../dataset/*.jsonl

# Generate the various datasets
echo "## Generating word reptition dataset ##"

#
# We switch over to fully masked instruct+input, to properly learn the memorization task.
#
# Arguments: <output jsonl path> <max word count> <sample count>
# Every generator is started as a background job (&) so they run in parallel.
#
python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl  2  5000 &
# Word counts 5, 10, ... 95 (in steps of 5), with 5000 samples each
for i in {5..95..5} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 5000 & 
done
python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-100-count.jsonl 100 5000 &
python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-200-count.jsonl 200 5000 &

#
# We mix in the shuffled word list, so that we ensure all words / tokens are learned.
# However, this might introduce an exclusion bias (if this word was seen, never repeat it),
# so we limit the share of these data samples in the mixture.
#
# Arguments: <output jsonl path> <max word count> <repeat count>
# (the last argument is reported as "token repeat" in the script's output)
#
python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-10-count.jsonl 10 20 &
python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-15-count.jsonl 15 20 &
python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-25-count.jsonl 25 30 &
python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-50-count.jsonl 50 50 &
python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-75-count.jsonl 75 50 &
python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-100-count.jsonl 100 50 &
python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-200-count.jsonl 200 50 &

# Block until every background generator has finished
wait
echo "## Done ##"

ls -alh ../dataset/

## Generating word reptition dataset ##
Generated JSONL file with - 2 max words, 5000 samples - at ../dataset/word-2-count.jsonl
Generated JSONL file with - 5 max words, 5000 samples - at ../dataset/gen-word-5-count.jsonl
Generated a single JSONL file with 3564 samples (20 token repeat) - 15 max words - at ../dataset/shuffle-word-15-count.jsonl
Generated JSONL file with - 15 max words, 5000 samples - at ../dataset/gen-word-15-count.jsonl
Generated a single JSONL file with 3184 samples (30 token repeat) - 25 max words - at ../dataset/shuffle-word-25-count.jsonl
Generated a single JSONL file with 677 samples (50 token repeat) - 200 max words - at ../dataset/shuffle-word-200-count.jsonl
Generated a single JSONL file with 1322 samples (50 token repeat) - 100 max words - at ../dataset/shuffle-word-100-count.jsonl
Generated a single JSONL file with 5209 samples (20 token repeat) - 10 max words - at ../dataset/shuffle-word-10-count.jsonl
Generated a single JSONL file with 1770 samples (50 tok

In [52]:
# Start the Tune 2 finetune (memory training at ctx 512), starting from the
# exported mem-instruct model.
# - WANDB_MODE and RWKV_WAVENET_LAYERS are exported as environment variables for the trainer
# - checkpoints go to the "<FILENAME_PREFIX>-mem-ctx-512" directory under ../checkpoint/
#   (relative to TRAINER_DIR)
# - the command line overrides the template YAML with the learning rate
#   (lr_init=5e-4, lr_final=4e-4), data.max_token_size=512, ctx_len=512
#   and bptt_learning_range=1
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-512 (train-ctx=512, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/" \
        --model.lr_init=5e-4 \
        --model.lr_final=4e-4 \
        --data.max_token_size=512 \
        --model.ctx_len=512 \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-instruct.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 3074262678
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230816_021557-nnk974ui[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Mem-Tune ctx-512 (train-ctx=512, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experimen

In [53]:
# Lets export the model from the checkpoint (ctx-512 stage).
#
# export_checkpoint.py is given the checkpoint path followed by the output
# .pth path; both are relative to TRAINER_DIR because of the `cd`.
# The second command lists the exported file so its size and timestamp
# are recorded in the cell output.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-512.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-512.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/v5wave-1B5-E0_1-mem-ctx-512/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 8
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 486 params 1515107840 elements
Saving fp32 state dict to ../model/v5wave-1B5-E0_1-mem-ctx-512.pth
-rw-r--r-- 1 root root 5.7G Aug 16 03:38 ../model/v5wave-1B5-E0_1-mem-ctx-512.pth


In [54]:
# Lets do a quick memory test of the exported ctx-512 model.
#
# Unlike the training / export cells there is no `cd` here, so the script path
# is relative to the notebook's working directory and the model is addressed
# through PROJECT_DIR. With only the model path given, the script reports the
# token match rate for increasing token counts (5, 10, 15, ... in the output).
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-512.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 5 tokens : 100.0% similarity, with 5 matched token, and 0 token mismatch
## Model validation for 10 tokens : 100.0% similarity, with 10 matched token, and 0 token mismatch
## Model validation for 15 tokens : 100.0% similarity, with 15 matched token, and 0 token mismatch
## Model validation for 20 tokens : 100.0% similarity, with 20 matched token, and 0 token mismatch
## Model validation for 25 tokens : 100.0% similarity, with 25 matched token, and 0 token mismatch
## Model validation for 30 tokens : 100.0% similarity, with 30 matched token, and 0 token mismatch
## Model validation for 35 tokens : 100.0% similarity, with 35 matched token, and 0 token mismatch
## Model validation for 40 tokens : 100.0% similarity, with 40 matched token, and 0 token mismatch
## Model validation for 45 tokens : 100.0% similari

## Tune 3 : Low ctx size (1024), memory training

- Tune 3: Low ctx size (1024), Scaling up !

In [55]:
%%script bash

########################################
# Generate the required jsonl dataset
# (used by the ctx-1k memory tune)
#
# Script arguments, as used below:
#   gen_limited_prompt_completion_jsonl.py     <out.jsonl> <max words> <samples>
#   shuffle_limited_prompt_completion_jsonl.py <out.jsonl> <max words> <repeat>
#     (the shuffle script reports its last argument as "token repeat")
# Every invocation is started in the background (&) and collected by the
# single `wait` near the end of the cell.
########################################

# Reset the dataset dir (only *.jsonl files are removed)
mkdir -p ../dataset
rm -rf ../dataset/*.jsonl

# Generate the various datasets
echo "## Generating word reptition dataset ##"

#
# We reduce the training set for lower word count - and shift the focus upwards
# (2 words, then 5 - 45 words in steps of 5)
#
python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 400 &
for i in {5..45..5} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 400 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 10 & 
done

#
# Ramping up the 50 - 550 words dataset (steps of 5),
# with double the sample / repeat counts of the range above
#
for i in {50..550..5} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 800 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & 
done

# Block until every background generator has exited.
# NOTE(review): `wait` with no arguments returns 0 regardless of the jobs'
# exit codes, so a failed generator will not stop this cell.
wait
echo "## Done ##"

# List the generated files
ls -alh ../dataset/

## Generating word reptition dataset ##
Generated JSONL file with - 10 max words, 400 samples - at ../dataset/gen-word-10-count.jsonl
Generated JSONL file with - 15 max words, 400 samples - at ../dataset/gen-word-15-count.jsonl
Generated JSONL file with - 2 max words, 400 samples - at ../dataset/word-2-count.jsonl
Generated JSONL file with - 5 max words, 400 samples - at ../dataset/gen-word-5-count.jsonl
Generated JSONL file with - 30 max words, 400 samples - at ../dataset/gen-word-30-count.jsonl
Generated JSONL file with - 20 max words, 400 samples - at ../dataset/gen-word-20-count.jsonl
Generated JSONL file with - 40 max words, 400 samples - at ../dataset/gen-word-40-count.jsonl
Generated a single JSONL file with 1298 samples (10 token repeat) - 20 max words - at ../dataset/shuffle-word-20-count.jsonl
Generated a single JSONL file with 877 samples (10 token repeat) - 30 max words - at ../dataset/shuffle-word-30-count.jsonl
Generated JSONL file with - 25 max words, 400 samples - at ..

In [56]:
# Start the finetune model training (memory tune, ctx-1k stage).
#
# Same trainer + template config as the ctx-512 stage, with these overrides:
#   - weights are loaded from the exported ctx-512 model
#   - model.ctx_len and data.max_token_size are both 1024
#   - lr_init=4e-4, lr_final=2e-4
#   - checkpoints go to ../checkpoint/<FILENAME_PREFIX>-mem-ctx-1k/
# All "../" paths are relative to TRAINER_DIR because of the leading `cd`.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-1k (train-ctx=1k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-1k/" \
        --model.lr_init=4e-4 \
        --model.lr_final=2e-4 \
        --data.max_token_size=1024 \
        --model.ctx_len=1024 \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-ctx-512.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 95825968
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230816_033941-pw017xyo[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Mem-Tune ctx-1k (train-ctx=1k, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments/r

In [57]:
# Lets export the model from the checkpoint (ctx-1k stage).
#
# export_checkpoint.py is given the checkpoint path followed by the output
# .pth path; both are relative to TRAINER_DIR because of the `cd`.
# The second command lists the exported file so its size and timestamp
# are recorded in the cell output.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-1k/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-1k.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-1k.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/v5wave-1B5-E0_1-mem-ctx-1k/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 8
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 486 params 1515107840 elements
Saving fp32 state dict to ../model/v5wave-1B5-E0_1-mem-ctx-1k.pth
-rw-r--r-- 1 root root 5.7G Aug 16 05:33 ../model/v5wave-1B5-E0_1-mem-ctx-1k.pth


In [58]:
# Lets do a quick memory test of the exported ctx-1k model.
#
# No `cd` here: the script path is relative to the notebook's working
# directory and the model is addressed through PROJECT_DIR. With only the
# model path given, the script reports the token match rate for increasing
# token counts (5, 10, 15, ... in the output).
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-1k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 5 tokens : 100.0% similarity, with 5 matched token, and 0 token mismatch
## Model validation for 10 tokens : 100.0% similarity, with 10 matched token, and 0 token mismatch
## Model validation for 15 tokens : 100.0% similarity, with 15 matched token, and 0 token mismatch
## Model validation for 20 tokens : 100.0% similarity, with 20 matched token, and 0 token mismatch
## Model validation for 25 tokens : 100.0% similarity, with 25 matched token, and 0 token mismatch
## Model validation for 30 tokens : 100.0% similarity, with 30 matched token, and 0 token mismatch
## Model validation for 35 tokens : 100.0% similarity, with 35 matched token, and 0 token mismatch
## Model validation for 40 tokens : 100.0% similarity, with 40 matched token, and 0 token mismatch
## Model validation for 45 tokens : 100.0% similari

## Tune 4 : Low ctx size (2048), memory training

- Tune 4: Low ctx size (2048), Scaling up !

In [59]:
%%script bash

########################################
# Generate the required jsonl dataset
# (used by the ctx-2k memory tune)
#
# Script arguments, as used below:
#   gen_limited_prompt_completion_jsonl.py     <out.jsonl> <max words> <samples>
#   shuffle_limited_prompt_completion_jsonl.py <out.jsonl> <max words> <repeat>
#     (the shuffle script reports its last argument as "token repeat")
# Every invocation is started in the background (&) and collected by the
# single `wait` near the end of the cell.
########################################

# Reset the dataset dir (only *.jsonl files are removed)
mkdir -p ../dataset
rm -rf ../dataset/*.jsonl

# Generate the various datasets
echo "## Generating word reptition dataset ##"

#
# We reduce the training set for lower word count - and shift the focus upwards
# (2 words, then 5 - 100 words in steps of 5)
#
python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 100 &
for i in {5..100..5} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 100 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & 
done

#
# Ramping up the 105 - 2000 words dataset (steps of 5)
# NOTE(review): this loop alone starts 760 background python processes
# (380 word counts x 2 scripts) at once.
#
for i in {105..2000..5} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 200 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & 
done

# Block until every background generator has exited.
# NOTE(review): `wait` with no arguments returns 0 regardless of the jobs'
# exit codes, so a failed generator will not stop this cell.
wait
echo "## Done ##"

# List the generated files
ls -alh ../dataset/

## Generating word reptition dataset ##
Generated JSONL file with - 5 max words, 100 samples - at ../dataset/gen-word-5-count.jsonl
Generated JSONL file with - 2 max words, 100 samples - at ../dataset/word-2-count.jsonl
Generated JSONL file with - 15 max words, 100 samples - at ../dataset/gen-word-15-count.jsonl
Generated a single JSONL file with 64 samples (1 token repeat) - 40 max words - at ../dataset/shuffle-word-40-count.jsonl
Generated JSONL file with - 35 max words, 100 samples - at ../dataset/gen-word-35-count.jsonl
Generated a single JSONL file with 175 samples (1 token repeat) - 15 max words - at ../dataset/shuffle-word-15-count.jsonl
Generated a single JSONL file with 74 samples (1 token repeat) - 35 max words - at ../dataset/shuffle-word-35-count.jsonl
Generated JSONL file with - 40 max words, 100 samples - at ../dataset/gen-word-40-count.jsonl
Generated JSONL file with - 25 max words, 100 samples - at ../dataset/gen-word-25-count.jsonl
Generated JSONL file with - 45 max wo

In [60]:
# Start the finetune model training (memory tune, ctx-2k stage).
#
# Same trainer + template config as the previous stages, with these overrides:
#   - weights are loaded from the exported ctx-1k model
#   - model.ctx_len and data.max_token_size are both 2048
#   - lr_init=3e-4, lr_final=1e-4
#   - checkpoints go to ../checkpoint/<FILENAME_PREFIX>-mem-ctx-2k/
# All "../" paths are relative to TRAINER_DIR because of the leading `cd`.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-2k (train-ctx=2k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-2k/" \
        --model.lr_init=3e-4 \
        --model.lr_final=1e-4 \
        --data.max_token_size=2048 \
        --model.ctx_len=2048 \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-ctx-1k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 3598725636
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230816_053348-sa9zm5nr[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Mem-Tune ctx-2k (train-ctx=2k, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments

In [61]:
# Lets export the model from the checkpoint (ctx-2k stage).
#
# export_checkpoint.py is given the checkpoint path followed by the output
# .pth path; both are relative to TRAINER_DIR because of the `cd`.
# The second command lists the exported file so its size and timestamp
# are recorded in the cell output.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-2k/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-2k.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-2k.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/v5wave-1B5-E0_1-mem-ctx-2k/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 8
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 486 params 1515107840 elements
Saving fp32 state dict to ../model/v5wave-1B5-E0_1-mem-ctx-2k.pth
-rw-r--r-- 1 root root 5.7G Aug 16 07:32 ../model/v5wave-1B5-E0_1-mem-ctx-2k.pth


In [62]:
# Lets do a quick memory test of the exported ctx-2k model.
#
# No `cd` here: the script path is relative to the notebook's working
# directory and the model is addressed through PROJECT_DIR. With only the
# model path given, the script reports the token match rate for increasing
# token counts (5, 10, 15, ... in the output).
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-2k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 5 tokens : 100.0% similarity, with 5 matched token, and 0 token mismatch
## Model validation for 10 tokens : 100.0% similarity, with 10 matched token, and 0 token mismatch
## Model validation for 15 tokens : 100.0% similarity, with 15 matched token, and 0 token mismatch
## Model validation for 20 tokens : 100.0% similarity, with 20 matched token, and 0 token mismatch
## Model validation for 25 tokens : 100.0% similarity, with 25 matched token, and 0 token mismatch
## Model validation for 30 tokens : 100.0% similarity, with 30 matched token, and 0 token mismatch
## Model validation for 35 tokens : 100.0% similarity, with 35 matched token, and 0 token mismatch
## Model validation for 40 tokens : 100.0% similarity, with 40 matched token, and 0 token mismatch
## Model validation for 45 tokens : 100.0% similari

## Tune 5 : Ramping up the ctx size (4096), memory training

- Tune 5: Mid ctx size (4096), Scaling up!

In [63]:
%%script bash

########################################
# Generate the required jsonl dataset
# (used by the ctx-4k memory tune)
#
# Script arguments, as used below:
#   gen_limited_prompt_completion_jsonl.py     <out.jsonl> <max words> <samples>
#   shuffle_limited_prompt_completion_jsonl.py <out.jsonl> <max words> <repeat>
#     (the shuffle script reports its last argument as "token repeat")
# Every invocation is started in the background (&) and collected by the
# single `wait` near the end of the cell.
########################################

# Reset the dataset dir (only *.jsonl files are removed)
mkdir -p ../dataset
rm -rf ../dataset/*.jsonl

# Generate the various datasets
echo "## Generating word reptition dataset ##"

#
# We reduce the training set for the lower word counts - and shift the focus upwards
# (2 words, then 5 - 500 words in steps of 5)
#
python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 100 &
for i in {5..500..5} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 100 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & 
done

#
# Ramping up the 505 - 4000 words dataset (steps of 5)
# NOTE(review): this loop alone starts 1400 background python processes
# (700 word counts x 2 scripts) at once.
#
for i in {505..4000..5} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 200 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & 
done

# Block until every background generator has exited.
# NOTE(review): `wait` with no arguments returns 0 regardless of the jobs'
# exit codes, so a failed generator will not stop this cell.
wait
echo "## Done ##"

# List the generated files
ls -alh ../dataset/

## Generating word reptition dataset ##
Generated JSONL file with - 2 max words, 100 samples - at ../dataset/word-2-count.jsonl
Generated a single JSONL file with 106 samples (1 token repeat) - 25 max words - at ../dataset/shuffle-word-25-count.jsonl
Generated JSONL file with - 20 max words, 100 samples - at ../dataset/gen-word-20-count.jsonl
Generated JSONL file with - 30 max words, 100 samples - at ../dataset/gen-word-30-count.jsonl
Generated JSONL file with - 40 max words, 100 samples - at ../dataset/gen-word-40-count.jsonl
Generated a single JSONL file with 552 samples (1 token repeat) - 5 max words - at ../dataset/shuffle-word-5-count.jsonl
Generated JSONL file with - 25 max words, 100 samples - at ../dataset/gen-word-25-count.jsonl
Generated JSONL file with - 10 max words, 100 samples - at ../dataset/gen-word-10-count.jsonl
Generated a single JSONL file with 30 samples (1 token repeat) - 90 max words - at ../dataset/shuffle-word-90-count.jsonl
Generated JSONL file with - 35 max w

In [64]:
# Start the finetune model training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-4k (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-4k/" \
        --model.lr_init=3e-4 \
        --model.lr_final=1e-4 \
        --data.max_token_size=4096 \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-ctx-1k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 3200350149
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230816_073320-vvucmnk7[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Mem-Tune ctx-4k (train-ctx=4k, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments

In [65]:
# Lets export the model from the checkpoint (ctx-4k stage).
#
# export_checkpoint.py is given the checkpoint path followed by the output
# .pth path; both are relative to TRAINER_DIR because of the `cd`.
# The second command lists the exported file so its size and timestamp
# are recorded in the cell output.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-4k/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-4k.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-4k.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/v5wave-1B5-E0_1-mem-ctx-4k/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 8
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 486 params 1515107840 elements
Saving fp32 state dict to ../model/v5wave-1B5-E0_1-mem-ctx-4k.pth
-rw-r--r-- 1 root root 5.7G Aug 16 13:08 ../model/v5wave-1B5-E0_1-mem-ctx-4k.pth


In [66]:
# Lets do a quick memory test of the exported ctx-4k model.
#
# No `cd` here: the script path is relative to the notebook's working
# directory and the model is addressed through PROJECT_DIR. With only the
# model path given, the script reports the token match rate for increasing
# token counts (5, 10, 15, ... in the output).
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-4k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 5 tokens : 100.0% similarity, with 5 matched token, and 0 token mismatch
## Model validation for 10 tokens : 100.0% similarity, with 10 matched token, and 0 token mismatch
## Model validation for 15 tokens : 100.0% similarity, with 15 matched token, and 0 token mismatch
## Model validation for 20 tokens : 100.0% similarity, with 20 matched token, and 0 token mismatch
## Model validation for 25 tokens : 100.0% similarity, with 25 matched token, and 0 token mismatch
## Model validation for 30 tokens : 100.0% similarity, with 30 matched token, and 0 token mismatch
## Model validation for 35 tokens : 100.0% similarity, with 35 matched token, and 0 token mismatch
## Model validation for 40 tokens : 100.0% similarity, with 40 matched token, and 0 token mismatch
## Model validation for 45 tokens : 100.0% similari

## Tune 6 : Ramping up the ctx size (8192), memory training

- Tune 6: Large ctx size (8192), Scaling up!

In [67]:
%%script bash

########################################
# Generate the required jsonl dataset
# (used by the ctx-8k memory tune)
#
# Script arguments, as used below:
#   gen_limited_prompt_completion_jsonl.py     <out.jsonl> <max words> <samples>
#   shuffle_limited_prompt_completion_jsonl.py <out.jsonl> <max words> <repeat>
#     (the shuffle script reports its last argument as "token repeat")
# Every invocation is started in the background (&) and collected by the
# single `wait` near the end of the cell.
########################################

# Reset the dataset dir (only *.jsonl files are removed)
mkdir -p ../dataset
rm -rf ../dataset/*.jsonl

# Generate the various datasets
echo "## Generating word reptition dataset ##"

#
# We reduce the training set for the lower word counts - and shift the focus upwards
# (2 words, then 5 - 1000 words in steps of 5)
#
python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 50 &
for i in {5..1000..5} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 50 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & 
done

#
# Ramping up the 1100 - 8000 words dataset (steps of 100),
# with far larger sample / repeat counts than the range above
#
for i in {1100..8000..100} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 2000 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & 
done

# Block until every background generator has exited.
# NOTE(review): `wait` with no arguments returns 0 regardless of the jobs'
# exit codes, so a failed generator will not stop this cell.
wait
echo "## Done ##"

# List the generated files
ls -lh ../dataset/

## Generating word reptition dataset ##
Generated JSONL file with - 2 max words, 50 samples - at ../dataset/word-2-count.jsonl
Generated JSONL file with - 5 max words, 50 samples - at ../dataset/gen-word-5-count.jsonl
Generated JSONL file with - 10 max words, 50 samples - at ../dataset/gen-word-10-count.jsonl
Generated JSONL file with - 30 max words, 50 samples - at ../dataset/gen-word-30-count.jsonl
Generated JSONL file with - 25 max words, 50 samples - at ../dataset/gen-word-25-count.jsonl
Generated JSONL file with - 15 max words, 50 samples - at ../dataset/gen-word-15-count.jsonl
Generated a single JSONL file with 129 samples (1 token repeat) - 20 max words - at ../dataset/shuffle-word-20-count.jsonl
Generated a single JSONL file with 180 samples (1 token repeat) - 15 max words - at ../dataset/shuffle-word-15-count.jsonl
Generated a single JSONL file with 108 samples (1 token repeat) - 25 max words - at ../dataset/shuffle-word-25-count.jsonl
Generated a single JSONL file with 86 sam

In [68]:
# Start the finetune model training (memory tune, ctx-8k stage).
#
# Same trainer + template config as the previous stages, with these overrides:
#   - weights are loaded from the exported ctx-4k model
#   - data.max_token_size is 8192 while model.ctx_len stays at 4096
#     (hence "train-ctx=4k" in the run name)
#   - bptt_learning_range is raised from 1 to 2
#     NOTE(review): presumably so learning spans both 4k segments of an 8k
#     sample - confirm against the trainer's documentation
#   - lr_init=3e-4, lr_final=1e-4
#   - checkpoints go to ../checkpoint/<FILENAME_PREFIX>-mem-ctx-8k/
# All "../" paths are relative to TRAINER_DIR because of the leading `cd`.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-8k (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/" \
        --model.lr_init=3e-4 \
        --model.lr_final=1e-4 \
        --data.max_token_size=8192 \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=2 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-ctx-4k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 3002262312
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230816_130949-mh3nzrrl[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Mem-Tune ctx-8k (train-ctx=4k, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments

In [69]:
# Lets export the model from the checkpoint (ctx-8k stage).
#
# export_checkpoint.py is given the checkpoint path followed by the output
# .pth path; both are relative to TRAINER_DIR because of the `cd`.
# The second command lists the exported file so its size and timestamp
# are recorded in the cell output.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-8k.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-8k.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/v5wave-1B5-E0_1-mem-ctx-8k/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 8
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 486 params 1515107840 elements
Saving fp32 state dict to ../model/v5wave-1B5-E0_1-mem-ctx-8k.pth
-rw-r--r-- 1 root root 5.7G Aug 16 21:43 ../model/v5wave-1B5-E0_1-mem-ctx-8k.pth


In [70]:
# Lets do a quick memory test of the exported ctx-8k model (default range).
#
# No `cd` here: the script path is relative to the notebook's working
# directory and the model is addressed through PROJECT_DIR. With only the
# model path given, the script reports the token match rate for increasing
# token counts (5, 10, 15, ... in the output).
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-8k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 5 tokens : 100.0% similarity, with 5 matched token, and 0 token mismatch
## Model validation for 10 tokens : 100.0% similarity, with 10 matched token, and 0 token mismatch
## Model validation for 15 tokens : 100.0% similarity, with 15 matched token, and 0 token mismatch
## Model validation for 20 tokens : 100.0% similarity, with 20 matched token, and 0 token mismatch
## Model validation for 25 tokens : 100.0% similarity, with 25 matched token, and 0 token mismatch
## Model validation for 30 tokens : 100.0% similarity, with 30 matched token, and 0 token mismatch
## Model validation for 35 tokens : 100.0% similarity, with 35 matched token, and 0 token mismatch
## Model validation for 40 tokens : 100.0% similarity, with 40 matched token, and 0 token mismatch
## Model validation for 45 tokens : 100.0% similari

In [71]:
# Extended memory test of the exported ctx-8k model, at larger token counts.
#
# Three extra positional arguments follow the model path: "none" 1100 4000.
# The output starts at 1100 tokens, so 1100 / 4000 appear to select the token
# range to evaluate.
# NOTE(review): the meaning of the "none" argument is not visible from this
# notebook - confirm against eval_v5wavenet_memory_guided.py.
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
        python3 ../memory_script/eval_v5wavenet_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-8k.pth" "none" 1100 4000

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
###
### Model validation start ###
###
## Model validation for 1100 tokens : 97.81818181818181% similarity, with 1076 matched token, and 24 token mismatch
## Model validation for 1150 tokens : 97.30434782608695% similarity, with 1119 matched token, and 31 token mismatch
## Model validation for 1200 tokens : 97.08333333333333% similarity, with 1165 matched token, and 35 token mismatch
## Model validation for 1250 tokens : 97.11999999999999% similarity, with 1214 matched token, and 36 token mismatch
## Model validation for 1300 tokens : 97.07692307692307% similarity, with 1262 matched token, and 38 token mismatch
## Model validation for 1350 tokens : 96.74074074074073% similarity, with 1306 matched token, and 44 token mismatch
## Model validation for 1400 tokens : 96.35714285714285% similarity, with 1349 matched token, and 51 token mismatch
## Model validation for 1450 

In [72]:
%%script bash

########################################
# Generate the required jsonl dataset
# (used by the ctx-16k memory tune)
#
# Script arguments, as used below:
#   gen_limited_prompt_completion_jsonl.py     <out.jsonl> <max words> <samples>
#   shuffle_limited_prompt_completion_jsonl.py <out.jsonl> <max words> <repeat>
#     (the shuffle script reports its last argument as "token repeat")
# Every invocation is started in the background (&) and collected by the
# single `wait` near the end of the cell.
########################################

# Reset the dataset dir (only *.jsonl files are removed)
mkdir -p ../dataset
rm -rf ../dataset/*.jsonl

# Generate the various datasets
echo "## Generating word reptition dataset ##"

#
# Sparse coverage of the lower word counts - the focus is shifted upwards
# (2 words, then 5, 55, 105, ... 955 words: the brace range steps by 50 from 5)
#
python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 50 &
for i in {5..1000..50} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 500 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 10 & 
done

#
# Ramping up the 1100 - 12000 words dataset (steps of 100)
#
for i in {1100..12000..100} 
do
    python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 2000 & 
    python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & 
done

# Block until every background generator has exited.
# NOTE(review): `wait` with no arguments returns 0 regardless of the jobs'
# exit codes, so a failed generator will not stop this cell.
wait
echo "## Done ##"

# List the generated files
ls -lh ../dataset/

## Generating word reptition dataset ##
Generated a single JSONL file with 205 samples (10 token repeat) - 105 max words - at ../dataset/shuffle-word-105-count.jsonl
Generated JSONL file with - 2 max words, 50 samples - at ../dataset/word-2-count.jsonl
Generated JSONL file with - 5 max words, 500 samples - at ../dataset/gen-word-5-count.jsonl
Generated a single JSONL file with 50 samples (10 token repeat) - 555 max words - at ../dataset/shuffle-word-555-count.jsonl
Generated a single JSONL file with 110 samples (10 token repeat) - 205 max words - at ../dataset/shuffle-word-205-count.jsonl
Generated a single JSONL file with 484 samples (10 token repeat) - 55 max words - at ../dataset/shuffle-word-55-count.jsonl
Generated a single JSONL file with 60 samples (10 token repeat) - 405 max words - at ../dataset/shuffle-word-405-count.jsonl
Generated a single JSONL file with 41 samples (10 token repeat) - 655 max words - at ../dataset/shuffle-word-655-count.jsonl
Generated a single JSONL file 

In [None]:
# Start the finetune model training (memory tune, ctx-16k stage).
#
# Same trainer + template config as the previous stages, with these overrides:
#   - weights are loaded from the exported ctx-8k model
#   - data.max_token_size is 16384 while model.ctx_len stays at 4096
#     (hence "train-ctx=4k" in the run name)
#   - bptt_learning_range is raised to 4
#     NOTE(review): presumably so learning spans all four 4k segments of a
#     16k sample - confirm against the trainer's documentation
#   - lr_init=3e-4, lr_final=1e-4
#   - checkpoints go to ../checkpoint/<FILENAME_PREFIX>-mem-ctx-16k/
# All "../" paths are relative to TRAINER_DIR because of the leading `cd`.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/v5wave-1B5-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-16k (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-16k/" \
        --model.lr_init=3e-4 \
        --model.lr_final=1e-4 \
        --data.max_token_size=16384 \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=4 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-ctx-8k.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 1381925662
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230816_214509-dbxzt0py[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5wave-1B5-0.1 - Mem-Tune ctx-16k (train-ctx=4k, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiment