# Hugging Face builder test

In [1]:
# Resolve every path used by this notebook, relative to the notebook's own directory
import os

# Assumes the kernel was started in the folder that contains this notebook
notebook_dir = os.getcwd()
print("Notebook directory:", notebook_dir)

# Source checkpoint, expected at <notebook_dir>/.model/<model_filename>.pth
model_filename = "v7-1B5-world"
model_file = os.path.join(notebook_dir, ".model", f"{model_filename}.pth")
print("Model file path:", model_file)

# Fail fast, naming the missing path, instead of failing later inside the builder
if not os.path.isfile(model_file):
    raise FileNotFoundError(f"Model file does not exist: {model_file}")

# Get the project directory two levels up
project_dir = os.path.dirname(os.path.dirname(notebook_dir))
print("Project directory:", project_dir)

# Output build directory; the empty last component keeps the trailing separator
output_dir = os.path.join(notebook_dir, ".hf_build", model_filename, "")
print("Output directory:", output_dir)

Notebook directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose
Model file path: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.model/v7-1B5-world.pth
Project directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block
Output directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.hf_build/v7-1B5-world/


In [2]:
# Empty the output directory, if it exists
if os.path.isdir(output_dir):
    import shutil
    print("Removing existing output directory")
    shutil.rmtree(output_dir)
    
# Run the hf_builder.py
!python3 "$project_dir/hf_builder/hf_builder.py" --model_class "v7_goose" "$model_file" "$output_dir"

Removing existing output directory
-----------------------------
Converting RWKV model to HuggingFace format...
Model Class     : v7_goose
Model Source    : /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.model/v7-1B5-world.pth
Tokenizer Type  : auto
Output Directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.hf_build/v7-1B5-world/
-----------------------------
Building rwkv_block into HF code ...
Loading model weights raw state ...
Loading model config from weights ...
-----------------------------
Model Configuration:
{'vocab_size': 65536, 'num_hidden_layers': 24, 'hidden_size': 2048, 'hidden_size_att': 2048, 'hidden_size_ffn': 8192, 'head_size': 64, 'tmix_backend': 'auto', 'init_state_wkv': False, 'forward_chunk_size': 4096, 'dropout_rate': 0.0, 'use_cache': True, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': 

# Basic HELLO WORLD

In [3]:
# Load the built model, using the transformers library, and run a short generation smoke test
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# Validating the config and tokenizer are built correctly
config = AutoConfig.from_pretrained(output_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)

# Use the GPU when one is available, otherwise fall back to the CPU
RUN_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Only request the triton backend when running on CUDA; on the CPU fallback use "auto",
# which is the tmix_backend value listed in the built model's configuration.
# NOTE(review): confirm that "auto" resolves to a CPU-capable backend.
TMIX_BACKEND = "triton" if RUN_DEVICE == "cuda" else "auto"

# Build the model itself, then move it to the selected device
model = AutoModelForCausalLM.from_pretrained(output_dir, trust_remote_code=True, tmix_backend=TMIX_BACKEND, device=RUN_DEVICE)
model.to(RUN_DEVICE)
print("Model and tokenizer loaded successfully")

# Print the device being used
print("Running on device:", RUN_DEVICE)

# Prompts used for the smoke test
dragon_prompt = "\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese."
hellow_prompt = "HELLO WORLD"

# Generate a continuation for each prompt on RUN_DEVICE, using the model's default generation settings
print("---------------------------------")
for prompt in (hellow_prompt, dragon_prompt):
    print(f"Prompt: {prompt}")
    inputs = tokenizer(prompt, return_tensors="pt").to(RUN_DEVICE)
    outputs = model.generate(**inputs)
    print("Generated text:", tokenizer.decode(outputs[0], skip_special_tokens=True))
    print("---------------------------------")

  from .autonotebook import tqdm as notebook_tqdm
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Model and tokenizer loaded successfully
Running on device: cuda
---------------------------------
Prompt: HELLO WORLD


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Generated text: HELLO WORLD!
I am a newbie to this forum. I am trying to learn how to use the
---------------------------------
Prompt: 
In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.
Generated text: 
In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.
The dragons were discovered by a team of scientists led by Dr. John Smith, who was studying
---------------------------------


# MMLU validation testing (smaller validation set)
**(This is not a substitute for lm-eval-harness: the score is counted differently.)**

In [4]:
# MMLU tester directory
mmlu_test_dir = os.path.join(project_dir, "test/mmlu")

# Run the test dataset builder, optional:  --use_validation_set
!python3 {mmlu_test_dir}/BuildTestMMLU.py --hf_model "$output_dir" --n_shot 0 --use_validation_set

## Using HF model tokenizer: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.hf_build/v7-1B5-world/
## Loading MMLU cached dataset (n_shot=0,tokenizer=world): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_world-n_0-p_0-c_16-r0.pth
## Done: Dataset has been built and cached


In [5]:
# Run the HF based MMLU tester, with the cuda kernel
# Batch size of 32, is for a 1B5 model, n_shot 0, with 24GB vram (ie. 4090)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 32 --n_shot 0 --use_validation_set --tmix_backend "cuda"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.hf_build/v7-1B5-world/
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=world): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_world-n_0-p_0-c_16-r0.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : all (count=1531, batches=48) ...
Using /home/recursal/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/recursal/.cache/torch_extensions/py312_cu121/state_wind_backstepping/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module state_wind_backstepping...
Allowing ninja to set a default number

In [6]:
# Run the HF based MMLU tester, with the triton kernel
# Batch size of 32, is for a 1B5 model, n_shot 0, with 24GB vram (ie. 4090)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 32 --n_shot 0 --use_validation_set --tmix_backend "triton"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.hf_build/v7-1B5-world/
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=world): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_world-n_0-p_0-c_16-r0.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : all (count=1531, batches=48) ...
#### all - accuracy=0.3063 , probability=0.2980
------------------------------------------------
### MMLU overall test result : accuracy=0.3063 , probability=0.2980
------------------------------------------------


In [7]:
# Run the HF based MMLU tester, with the triton kernel
# Batch size of 32, is for a 1B5 model, n_shot 0, with 24GB vram (ie. 4090)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 32 --n_shot 0 --use_validation_set --tmix_backend "fla"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.hf_build/v7-1B5-world/
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=world): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-val-t_world-n_0-p_0-c_16-r0.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : all (count=1531, batches=48) ...
Traceback (most recent call last):
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/RunTestMMLU.py", line 170, in <module>
    main()
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/RunTestMMLU.py", line 164, in main
    mmlu_test_runner(
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/RunTestMMLU.py", line 73, in m

# MMLU testing (full test set)
**(This is not a substitute for lm-eval-harness: the score is counted differently.)**

In [8]:
# Run the HF based MMLU tester, with the cuda kernel
# Batch size of 32, is for a 1B5 model, n_shot 0, with 24GB vram (ie. 4090)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 32 --n_shot 0 --tmix_backend "cuda"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.hf_build/v7-1B5-world/
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=world): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-test-t_world-n_0-p_0-c_16-r0.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : abstract_algebra (count=100, batches=4) ...
Using /home/recursal/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/recursal/.cache/torch_extensions/py312_cu121/state_wind_backstepping/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module state_wind_backstepping...
Allowing ninja to set a de

In [9]:
# Run the HF based MMLU tester, with the triton kernel (modified)
# Batch size of 32, is for a 1B5 model, n_shot 0, with 24GB vram (ie. 4090)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 32 --n_shot 0 --tmix_backend "triton"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.hf_build/v7-1B5-world/
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=world): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-test-t_world-n_0-p_0-c_16-r0.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : abstract_algebra (count=100, batches=4) ...
Traceback (most recent call last):
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/RunTestMMLU.py", line 170, in <module>
    main()
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/RunTestMMLU.py", line 164, in main
    mmlu_test_runner(
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/RunTestMMLU.py", l

In [10]:
# Run the HF based MMLU tester, with the triton kernel (modified)
# Batch size of 32, is for a 1B5 model, n_shot 0, with 24GB vram (ie. 4090)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 32 --n_shot 0 --tmix_backend "fla"

------------------------------------------------
## Loading HF model: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_goose/.hf_build/v7-1B5-world/
------------------------------------------------
## Preparing the dataset
## Loading MMLU cached dataset (n_shot=0,tokenizer=world): /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/.mmlu_cache/mmlu-test-t_world-n_0-p_0-c_16-r0.pth
## Done: Dataset has been built and cached
------------------------------------------------
## Starting the MMLU test ...
### Running MMLU test : abstract_algebra (count=100, batches=4) ...
Traceback (most recent call last):
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/RunTestMMLU.py", line 170, in <module>
    main()
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/RunTestMMLU.py", line 164, in main
    mmlu_test_runner(
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/mmlu/RunTestMMLU.py", l