# Hugging Face builder test

In [1]:
# Resolve every path used by this notebook relative to its working directory
import os
notebook_dir = os.getcwd()
print("Notebook directory:", notebook_dir)

# The checkpoint is expected at <notebook_dir>/.model/<model_filename>.pth
model_filename = "qwerky7-7B"
model_file = os.path.join(notebook_dir, ".model", f"{model_filename}.pth")
print("Model file path:", model_file)

# Fail fast (and name the missing path) before any later cell shells out to the builder.
# FileNotFoundError is still an Exception subclass, so existing handlers keep working.
if not os.path.isfile(model_file):
    raise FileNotFoundError(f"Model file does not exist: {model_file}")

# Get the project directory two levels up
project_dir = os.path.dirname(os.path.dirname(notebook_dir))
print("Project directory:", project_dir)

# Output build directory (the trailing slash is part of the path handed to the builder)
output_dir = os.path.join(notebook_dir, f".hf_build/{model_filename}/")
print("Output directory:", output_dir)

Notebook directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky
Model file path: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.model/qwerky7-7B.pth
Project directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block
Output directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/


In [2]:
# Empty the output directory, if it exists
if os.path.isdir(output_dir):
    import shutil
    print("Removing existing output directory")
    shutil.rmtree(output_dir)
    
# Run the hf_builder.py
!python3 "$project_dir/hf_builder/hf_builder.py" --model_class "v7_qwerky" "$model_file" "$output_dir"

-----------------------------
Converting RWKV model to HuggingFace format...
Model Class     : v7_qwerky
Model Source    : /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.model/qwerky7-7B.pth
Tokenizer Type  : auto
Output Directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
-----------------------------
Building rwkv_block into HF code ...
Loading model weights raw state ...
Loading model config from weights ...
-----------------------------
Model Configuration:
{'vocab_size': 152064, 'num_hidden_layers': 28, 'hidden_size': 3584, 'hidden_size_att': 512, 'hidden_size_ffn': 18944, 'head_size': 128, 'tmix_backend': 'auto', 'init_state_wkv': False, 'forward_chunk_size': 4096, 'dropout_rate': 0.0, 'use_cache': True, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': True, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_wor

# Basic HELLO WORLD

In [3]:
# Load the built model, using the transformers library
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# Validating the config and tokenizer are built correctly
config = AutoConfig.from_pretrained(output_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)

# Use the GPU when one is available, otherwise fall back to the CPU
RUN_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Build the model itself, then move it to the selected device
# NOTE(review): the backend is spelled "fused_fla" here but "fla_fused" in the MMLU cells - confirm both are accepted
model = AutoModelForCausalLM.from_pretrained(output_dir, trust_remote_code=True, tmix_backend="fused_fla", device=RUN_DEVICE)
model.to(RUN_DEVICE)
print("Model and tokenizer loaded successfully")

# Print the device being used
print("Running on device:", RUN_DEVICE)

# Prompts used for the smoke test
dragon_prompt = "\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese."
hellow_prompt = "HELLO WORLD"

# Lets generate some text for each prompt, on the selected device.
# Gradients are not needed for generation, so everything runs under no_grad.
SEPARATOR = "---------------------------------"
with torch.no_grad():
    print(SEPARATOR)
    for prompt in (hellow_prompt, dragon_prompt):
        print(f"Prompt: {prompt}")
        inputs = tokenizer(prompt, return_tensors="pt").to(RUN_DEVICE)
        outputs = model.generate(**inputs)
        print("Generated text:", tokenizer.decode(outputs[0], skip_special_tokens=True))
        print(SEPARATOR)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  6.36it/s]
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Model and tokenizer loaded successfully
Running on device: cuda
---------------------------------
Prompt: HELLO WORLD


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Generated text: HELLO WORLD
---------

This is a simple hello world program in C. It prints "Hello, World!" to
---------------------------------
Prompt: 
In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.
Generated text: 
In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese. The dragons were able to communicate with the researchers, and they explained that they had been living in the
---------------------------------


# MMLU validation testing (smaller set)
**(This is not a substitute for lm-eval-harness: the score is computed differently.)**

In [2]:
# MMLU tester directory
mmlu_test_dir = os.path.join(project_dir, "test/mmlu")

# Run the test dataset builder, optional:  --use_validation_set
!python3 {mmlu_test_dir}/BuildTestMMLU.py --hf_model "$output_dir" --n_shot 0 --use_validation_set

## Using HF model tokenizer: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
## Building MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Building dataset for validation subject (n_shot=0): all
## Dataset is ready for validation subject (n_shot=0): all
## Longest prompt token length: 768
## Padding to target prompt length: 768
## Dataset is padded for validation subject (n_shot=0): all
## Saving MMLU dataset cache (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached


In [3]:
# Run the HF based MMLU tester, with the cuda kernel
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --use_validation_set --tmix_backend "cuda"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:00<00:00,  7.70it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : all (count=1531, batches=192) ...
Using /home/recursal/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/recursal/.cache/torch_extensions/py312_cu121/state_wind_backstepping/build.ninja...
If this is not desired, please set os.environ['TORCH_CU

In [4]:
# Run the HF based MMLU tester, with the triton kernel
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --use_validation_set --tmix_backend "triton"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:01<00:00,  3.93it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : all (count=1531, batches=192) ...
#### all - accuracy=0.6421 , probability=0.5407
------------------------------------------------
### MMLU overall test result : accuracy=0.6421 , probability=0.5407
------------------------------------------------


In [5]:
# Run the HF based MMLU tester, with the triton kernel
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --use_validation_set --tmix_backend "triton_bighead"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:00<00:00,  4.11it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : all (count=1531, batches=192) ...
#### all - accuracy=0.6434 , probability=0.5408
------------------------------------------------
### MMLU overall test result : accuracy=0.6434 , probability=0.5408
------------------------------------------------


In [6]:
# Run the HF based MMLU tester, with the triton kernel
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --use_validation_set --tmix_backend "fla"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:00<00:00,  4.34it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : all (count=1531, batches=192) ...
#### all - accuracy=0.6421 , probability=0.5396
------------------------------------------------
### MMLU overall test result : accuracy=0.6421 , probability=0.5396
------------------------------------------------


In [7]:
# Run the HF based MMLU tester, with the triton kernel
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --use_validation_set --tmix_backend "fla_fused"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:00<00:00,  5.51it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : all (count=1531, batches=192) ...
#### all - accuracy=0.6466 , probability=0.5405
------------------------------------------------
### MMLU overall test result : accuracy=0.6466 , probability=0.5405
------------------------------------------------


# MMLU testing 
**(This is not a substitute for lm-eval-harness: the score is computed differently.)**

In [8]:
# MMLU tester directory
mmlu_test_dir = os.path.join(project_dir, "test/mmlu")

# Run the test dataset builder, optional:  --use_validation_set
!python3 {mmlu_test_dir}/BuildTestMMLU.py --hf_model "$output_dir" --n_shot 0

## Using HF model tokenizer: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
## Building MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-test-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Building dataset for test subject (n_shot=0): abstract_algebra
## Dataset is ready for test subject (n_shot=0): abstract_algebra
## Building dataset for test subject (n_shot=0): anatomy
## Dataset is ready for test subject (n_shot=0): anatomy
## Building dataset for test subject (n_shot=0): astronomy
## Dataset is ready for test subject (n_shot=0): astronomy
## Building dataset for test subject (n_shot=0): business_ethics
## Dataset is ready for test subject (n_shot=0): business_ethics
## Building dataset for test subject (n_shot=0): clinical_knowledge
## Dataset is ready for test subject (n_shot=0): clinical_knowledge
## Building dataset for test subject (n_

In [9]:
# Run the HF based MMLU tester, with the cuda kernel (modified)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --tmix_backend "cuda"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:01<00:00,  3.94it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-test-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : abstract_algebra (count=100, batches=13) ...
Using /home/recursal/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/recursal/.cache/torch_extensions/py312_cu121/state_wind_backstepping/build.ninja...
If this is not desired, please set os.envir

In [10]:
# Run the HF based MMLU tester, with the triton kernel (modified)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --tmix_backend "triton"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:01<00:00,  3.93it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-test-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : abstract_algebra (count=100, batches=13) ...
#### abstract_algebra - accuracy=0.4100 , probability=0.3426
### Running MMLU test : anatomy (count=135, batches=17) ...
#### anatomy - accuracy=0.6593 , probability=0.5715
### Running MMLU test : astronomy (count=152, batches=19) ...
#### astronomy - accuracy=0.7632 , probability=0.

In [11]:
# Run the HF based MMLU tester, with the triton kernel (modified)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --tmix_backend "triton_bighead"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:00<00:00,  6.00it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-test-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : abstract_algebra (count=100, batches=13) ...
#### abstract_algebra - accuracy=0.4000 , probability=0.3423
### Running MMLU test : anatomy (count=135, batches=17) ...
#### anatomy - accuracy=0.6667 , probability=0.5719
### Running MMLU test : astronomy (count=152, batches=19) ...
#### astronomy - accuracy=0.7632 , probability=0.

In [12]:
# Run the HF based MMLU tester, with the triton kernel (modified)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --tmix_backend "fla"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:00<00:00,  5.69it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-test-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : abstract_algebra (count=100, batches=13) ...
#### abstract_algebra - accuracy=0.4000 , probability=0.3413
### Running MMLU test : anatomy (count=135, batches=17) ...
#### anatomy - accuracy=0.6593 , probability=0.5708
### Running MMLU test : astronomy (count=152, batches=19) ...
#### astronomy - accuracy=0.7697 , probability=0.

In [13]:
# Run the HF based MMLU tester, with the triton kernel (modified)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --tmix_backend "fla_fused"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwerky/.hf_build/qwerky7-7B/
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:00<00:00,  5.68it/s]
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=Qwen2TokenizerFast): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-test-t_Qwen2TokenizerFast-n_0-p_0-c_16-r1.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : abstract_algebra (count=100, batches=13) ...
#### abstract_algebra - accuracy=0.3900 , probability=0.3425
### Running MMLU test : anatomy (count=135, batches=17) ...
#### anatomy - accuracy=0.6519 , probability=0.5717
### Running MMLU test : astronomy (count=152, batches=19) ...
#### astronomy - accuracy=0.7632 , probability=0.