# Hugging Face builder test

In [1]:
# Resolve every path used by the rest of the notebook, relative to the
# notebook's current working directory.
import os
notebook_dir = os.getcwd()
print("Notebook directory:", notebook_dir)

# Source checkpoint lives at <notebook_dir>/.model/<model_filename>.pth
model_filename = "qwrky7-7B"
model_file = os.path.join(notebook_dir, ".model", f"{model_filename}.pth")
print("Model file path:", model_file)

# Fail fast, before any build step runs, if the checkpoint is missing.
# FileNotFoundError is still an Exception subclass, so existing handlers keep working.
if not os.path.isfile(model_file):
    raise FileNotFoundError(f"Model file does not exist: {model_file}")

# Get the project directory two levels up from the notebook directory
project_dir = os.path.dirname(os.path.dirname(notebook_dir))
print("Project directory:", project_dir)

# Output build directory; the trailing "" keeps the trailing path separator
output_dir = os.path.join(notebook_dir, ".hf_build", model_filename, "")
print("Output directory:", output_dir)

Notebook directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwrky
Model file path: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwrky/.model/qwrky7-7B.pth
Project directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block
Output directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwrky/.hf_build/qwrky7-7B/


In [3]:
# Empty the output directory, if it exists
if os.path.isdir(output_dir):
    import shutil
    print("Removing existing output directory")
    shutil.rmtree(output_dir)
    
# Run the hf_builder.py
!python3 "$project_dir/hf_builder/hf_builder.py" --model_class "v7_qwrky" "$model_file" "$output_dir"

-----------------------------
Converting RWKV model to HuggingFace format...
Model Class     : v7_qwrky
Model Source    : /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwrky/.model/qwrky7-7B.pth
Tokenizer Type  : auto
Output Directory: /home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/test/v7_qwrky/.hf_build/qwrky7-7B/
-----------------------------
Building rwkv_block into HF code ...
Loading model weights raw state ...
Loading model config from weights ...
Traceback (most recent call last):
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/hf_builder/hf_builder.py", line 379, in <module>
    main()
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/hf_builder/hf_builder.py", line 376, in main
    hf_builder(args)
  File "/home/recursal/rwkv-prj/layerwise-trainer/block/RWKV_block/hf_builder/hf_builder.py", line 267, in hf_builder
    from hf_code.v7_qwrky.configuration_qwrky7 import Qwrky7Config
  File "/home/recursal/rwkv-

# Basic HELLO WORLD

In [None]:
# Load the built model, using the transformers library
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# Validating the config and tokenizer are built correctly
config = AutoConfig.from_pretrained(output_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)

# Pick the device to run on: the GPU when CUDA is available, otherwise the CPU
RUN_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Build the model itself, then move it to the selected device
model = AutoModelForCausalLM.from_pretrained(output_dir, trust_remote_code=True, tmix_backend="fla", device=RUN_DEVICE)
model.to(RUN_DEVICE)
print("Model and tokenizer loaded successfully")

# Print the device being used
print("Running on device:", RUN_DEVICE)

# Prompts used for the smoke test
dragon_prompt = "\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese."
hellow_prompt = "HELLO WORLD"

# Generate text for each prompt on RUN_DEVICE. The printed output is the same
# as running the tokenize/generate/decode sequence once per prompt by hand.
print("---------------------------------")
for prompt in (hellow_prompt, dragon_prompt):
    print(f"Prompt: {prompt}")
    inputs = tokenizer(prompt, return_tensors="pt").to(RUN_DEVICE)
    outputs = model.generate(**inputs)
    print("Generated text:", tokenizer.decode(outputs[0], skip_special_tokens=True))
    print("---------------------------------")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  5.18it/s]
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Model and tokenizer loaded successfully
Running on device: cuda
---------------------------------
Prompt: HELLO WORLD


OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.

# MMLU validation testing (smaller set)
**(This is not a substitute for lm-eval-harness: the score is computed differently.)**

In [None]:
# MMLU tester directory
mmlu_test_dir = os.path.join(project_dir, "test/mmlu")

# Run the test dataset builder, optional:  --use_validation_set
!python3 {mmlu_test_dir}/BuildTestMMLU.py --hf_model "$output_dir" --n_shot 0 --use_validation_set

In [None]:
# Run the HF based MMLU tester, with the triton kernel
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --use_validation_set --tmix_backend "triton"

In [None]:
# Run the HF based MMLU tester, with the triton kernel
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --use_validation_set --tmix_backend "triton_bighead"

In [None]:
# Run the HF based MMLU tester, with the triton kernel
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --use_validation_set --tmix_backend "fla"

In [None]:
# Run the HF based MMLU tester, with the triton kernel
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --use_validation_set --tmix_backend "fla_fused"

# MMLU testing 
**(This is not a substitute for lm-eval-harness: the score is computed differently.)**

In [None]:
# Run the HF based MMLU tester, with the triton kernel (modified)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --tmix_backend "triton"

In [None]:
# Run the HF based MMLU tester, with the triton kernel (modified)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --tmix_backend "triton_bighead"

In [None]:
# Run the HF based MMLU tester, with the triton kernel (modified)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --tmix_backend "fla"

In [None]:
# Run the HF based MMLU tester, with the triton kernel (modified)
!python3 {mmlu_test_dir}/RunTestMMLU.py "$output_dir" --batch_size 8 --n_shot 0 --tmix_backend "fla_fused"