# RWKV CodeParrot + Enwiki (& instruct)
This is a custom model containing:
- 24 layers
- 2048 embedding size

It follows up on the memory tuned 4 model, and applies code training on top of it.

# Basic Setup

In [5]:
# First lets setup the various directories, and get the model we need
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/
!cd ../../../../model/ && wget -nc https://huggingface.co/picocreator/memory-size-experiment-for-rwkv/resolve/main/TokenShift-C-Stage2.pth
!ls -alh ../../../../model/TokenShift-C-Stage2.pth

# The various other stages, if you want to skip stuff

File ‘TokenShift-C-Stage2.pth’ already there; not retrieving.

-rw-r--r-- 1 root root 5.7G Jul 21 16:02 ../../../../model/TokenShift-C-Stage2.pth


In [6]:
# Notebook-wide configuration: the tunable settings and the directory layout.
# Later cells interpolate these names into their shell commands via {NAME}.
import os

# Training strategy, passed to the trainer as --trainer.strategy
DEEPSPEED_STRAT="deepspeed_stage_1"
# GPU device list, passed to the trainer as --trainer.devices
GPU_DEVICES="[4,5,6,7]"
# Toggle for wandb logging, and the prefix used when naming the wandb run
ENABLE_WANDB=True
WANDB_PREFIX="CodeShift-C"

print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)
print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)

# Value exported as the WANDB_MODE environment variable by the training cell
WANDB_MODE="online" if ENABLE_WANDB else "disabled"

# Computing the notebook, and various paths.
# The notebook directory is taken to be the kernel's current working directory.
# (Previously written as dirname(abspath("__file__")) with a *string literal*,
# which resolves to exactly this directory but reads like a typo.)
NOTEBOOK_DIR=os.getcwd()
# The project root sits four levels above the notebook directory
PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../../"))
# Trainer and inference scripts both live in the same sub-project folder
TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v4wavenet/"))
INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v4wavenet/"))

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("INFERENCE_DIR:", INFERENCE_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

DEEPSPEED_STRAT: deepspeed_stage_1
ENABLE_WANDB: True
GPU_DEVICES: [4,5,6,7]
NOTEBOOK_DIR: /root/rwkv5x-tokenshift-exp-A/notebook/experiment/tokenshift-exp/CodeShift
INFERENCE_DIR: /root/rwkv5x-tokenshift-exp-A/RWKV-v4wavenet
TRAINER_DIR: /root/rwkv5x-tokenshift-exp-A/RWKV-v4wavenet
PROJECT_DIR: /root/rwkv5x-tokenshift-exp-A


## CodeParrot training

In [9]:
# Let's preload the required dataset: run preload_datapath.py from TRAINER_DIR,
# pointing it at the YAML config that sits next to this notebook.
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/CodeShift-C-Enwiki-Parrot.yaml"

Found cached dataset json (/root/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-fb728533b9673c8b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.41s/it]
Loading cached processed dataset at /root/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-fb728533b9673c8b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c29625cf93303090_*_of_00064.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-fb728533b9673c8b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-2a1230f1bc0c64bf_*_of_00064.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-fb728533b9673c8b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-6f52cf55daacc

In [10]:
# Start the foundation model training.
# Runs lightning_trainer.py from TRAINER_DIR with WANDB_MODE exported into the
# trainer's environment. The YAML config next to this notebook is passed via -c;
# the remaining flags set the logger run name, the strategy and the device list
# from the values defined in the configuration cell.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/CodeShift-C-Enwiki-Parrot.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-Parrot (ctx=4096, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" 

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 4107967262
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230727_065722-ntraguqo[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mCodeShift-C - Enwiki-Parrot (ctx=4096, deepspeed_stage_1)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/picocreator/RWKV-5X-Experiments/runs/ntraguqo[0m
Using /r

In [11]:
# Let's export the model from the checkpoint.
# Both relative paths are resolved from TRAINER_DIR (because of the `cd`):
# the input is the `last.ckpt` checkpoint, the output is the exported .pth file.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/CodeShift-C-Enwiki-Parrot/last.ckpt" "../model/CodeShift-C-Enwiki-Parrot.pth"
# List the exported file to confirm it was written
!cd "{TRAINER_DIR}" && ls -alh "../model/CodeShift-C-Enwiki-Parrot.pth"

Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '../checkpoint/CodeShift-C-Enwiki-Parrot/last.ckpt/checkpoint'
Detected checkpoint of type zero stage ZeroStageEnum.optimizer_states, world_size: 4
Parsing checkpoint created by deepspeed==0.9.3
Reconstructed fp32 state dict with 438 params 1515106304 elements
Saving fp32 state dict to ../model/CodeShift-C-Enwiki-Parrot.pth
-rw-r--r-- 1 root root 5.7G Jul 29 03:18 ../model/CodeShift-C-Enwiki-Parrot.pth


In [12]:
# Let's do a quick dragon prompt validation on the exported model.
# Runs dragon_test.py from INFERENCE_DIR; "cuda fp32" is passed as its second argument.
!cd "{INFERENCE_DIR}" && python3 dragon_test.py ../model/CodeShift-C-Enwiki-Parrot.pth "cuda fp32"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py311_cu118/wkv_1024_bf16...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_1024_bf16/build.ninja...
Building extension module wkv_1024_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/3] /usr/local/cuda/bin/nvcc  -DTORCH_EXTENSION_NAME=wkv_1024_bf16 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.11/dist-packages/torch/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/TH -i

In [13]:
# Let's do a quick memory test (let's see if this behaviour is removed).
# Note: the script path is relative to the notebook's working directory, while
# the model path is absolute (built from PROJECT_DIR).
!python3 ../memory_script/eval_model_memory_guided.py "{PROJECT_DIR}/model/CodeShift-C-Enwiki-Parrot.pth"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_1024_bf16/build.ninja...
Building extension module wkv_1024_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_1024_bf16...
###
### Model validation start ###
###
## Model validation for 5 tokens : 20.0% similarity, with 1 matched token, and 4 token mismatch
## Model validation for 10 tokens : 20.0% similarity, with 2 matched token, and 8 token mismatch
## Model validation for 15 tokens : 13.333333333333334% similarity, with 2 matched token, and 13 token mismatch
## Model validation for 20 tokens : 10.0% similarity, with 2 matched token, and 18 token mismatch
## M