# Set Up

In [None]:
!huggingface-cli login
!pip install -qU huggingface_hub
!sudo apt install python3-venv -y
!pip install git+https://github.com/callummcdougall/eindex.git

import torch
import os
import sys
import yaml
from transformers import AutoModelForCausalLM, AutoTokenizer

# torch device strings are lowercase: "CPU" is rejected by torch.device()/Tensor.to().
device = "cuda" if torch.cuda.is_available() else "cpu"

# `!export ...` only changes a throwaway subshell and is lost immediately. Set the locale
# on the kernel process itself so it is inherited by later `!` commands and child processes.
os.environ["LANG"] = "ja_JP.UTF-8"

# Models

In [None]:
# Tokenizer and weights are both pulled from the same Hub repository.
BASE_MODEL_ID = "aloobun/Reyna-Mini-1.8B-v0.2"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

## [Nejumi LLM Leaderboard Benchmark](https://wandb.ai/wandb-japan/llm-leaderboard/reports/Nejumi-LLM-Leaderboard-Evaluating-Japanese-Language-Proficiency--Vmlldzo2MzU3NzIy#features-of-the-nejumi-leaderboard-%F0%9F%90%80)

Check the Japanese proficiency of the base model.

You'll need to update `llm-leaderboard/configs/config.yaml` with the following information:
```yaml
wandb:
  entity: "your/WANDB/entity"
  project: "your/WANDB/project"
  run_name: "your/WANDB/run_name"

...

model:
  pretrained_model_name_or_path: 'name of your model'

...

tokenizer:
  pretrained_model_name_or_path: 'name of your tokenizer'

...

metainfo:
  basemodel_name: "your modelname"
  model_type: "" # {open llm, commercial api}
```



In [None]:
# Fetch the leaderboard evaluation harness and its git submodules.
!git clone -q https://github.com/wandb/llm-leaderboard.git
!cd llm-leaderboard && git submodule init && git submodule update --remote
# Create a dedicated virtualenv at llm-leaderboard/llmjp and install the harness requirements
# into it; activation is chained with `&&` so it applies to the pip call on the same line.
!python3 -m venv llm-leaderboard/llmjp
!source llm-leaderboard/llmjp/bin/activate && pip install -q -r llm-leaderboard/requirements.txt
# Start config.yaml from the shipped template; fill in the fields listed above before evaluating.
!cp llm-leaderboard/configs/config_template.yaml llm-leaderboard/configs/config.yaml

In [None]:
# Run the evaluation script from inside the repo, using the llmjp virtualenv.
!cd llm-leaderboard && source llmjp/bin/activate && python3 scripts/run_eval.py

# Sparse Autoencoder

In [None]:
# Install the SAE training dependencies into the running kernel's environment.
%pip install -q transformer_lens sae-lens wandb accelerate

# Set TOKENIZERS_PARALLELISM and WANDB__SERVICE_WAIT for the training run below.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "WANDB__SERVICE_WAIT": "300",
    }
)

from sae_lens.training.config import LanguageModelSAERunnerConfig
from sae_lens.training.lm_runner import language_model_sae_runner

In [None]:
# Training configuration, following https://jbloomaus.github.io/SAELens/training_saes/
sae_training_settings = dict(
    # --- Data-generating function (model + training distribution) ---
    model_name="qwen1.5-0.5b",
    hook_point="blocks.18.hook_resid_pre",
    hook_point_layer=18,
    # d_in looked up in
    # https://neelnanda-io.github.io/TransformerLens/generated/model_properties_table.html
    d_in=1024,
    dataset_path="Skylion007/openwebtext",
    is_dataset_tokenized=False,
    # --- SAE parameters ---
    expansion_factor=64,
    b_dec_init_method="geometric_median",
    # --- Training parameters ---
    lr=0.0004,
    l1_coefficient=0.00008,
    lr_scheduler_name="constant",  # constantwithwarmup not supported?
    train_batch_size=4096,
    context_size=128,
    lr_warm_up_steps=5000,
    # --- Activation store parameters ---
    n_batches_in_buffer=128,
    total_training_tokens=1_000_000 * 100,
    store_batch_size=32,
    # --- Dead neurons and sparsity ---
    use_ghost_grads=True,
    feature_sampling_window=1000,
    dead_feature_window=5000,
    dead_feature_threshold=1e-6,
    # --- Weights & Biases logging ---
    log_to_wandb=True,
    wandb_project="mats_sae_training_qwen",
    wandb_entity=None,
    wandb_log_frequency=100,
    # --- Misc ---
    device="cuda",
    seed=42,
    n_checkpoints=10,
    checkpoint_path="checkpoints",
    dtype=torch.float32,
)
cfg = LanguageModelSAERunnerConfig(**sae_training_settings)

# Run SAE training with the configuration above.
sparse_autoencoder = language_model_sae_runner(cfg)


## Upload to HuggingFace


In [None]:
from huggingface_hub import HfApi

# Upload one run's checkpoint directory to the Hub, stored under a folder named after the run id.
api = HfApi()

uuid_str = "pqs59n3e"
repo_id = "kcoopermiller/qwen1.5-0.5b-saes"
local_folder = "checkpoints/" + uuid_str
hf_folder = uuid_str
api.upload_folder(
    repo_id=repo_id,
    repo_type="model",
    folder_path=local_folder,
    path_in_repo=hf_folder,
)

## Evaluating the SAEs

In [None]:
import json
import plotly.express as px
from transformer_lens import utils
from datasets import load_dataset
from typing import Dict
from pathlib import Path
from huggingface_hub import hf_hub_download
from functools import partial
from sae_lens.training.session_loader import LMSparseAutoencoderSessionloader
from sae_vis.data_fetching_fns import get_feature_data, FeatureData
from sae_vis.data_config_classes import SaeVisConfig
# Turn autograd off globally for the cells that follow.
torch.set_grad_enabled(False)
# Add the parent directory to the module search path.
sys.path.append("..")

In [None]:
REPO_ID = "kcoopermiller/qwen1.5-0.5b-saes"

# Which layer's SAE to load: 2 or 18.
layer = 2
# Layer 2 uses run "crujwafo"; any other value selects run "pqs59n3e".
if layer == 2:
    checkpoint = "crujwafo"
else:
    checkpoint = "pqs59n3e"
FILENAME = (
    f"{checkpoint}/final_sae_group_qwen1.5-0.5b_blocks.{layer}.hook_resid_pre_65536.pt"
)

# Download the checkpoint file from the Hub and rebuild the session from it.
path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

session = LMSparseAutoencoderSessionloader.load_session_from_pretrained(path=path)
model, sparse_autoencoders, activation_store = session
sparse_autoencoders.eval()
# Work with the first autoencoder in the loaded group.
sparse_autoencoder = list(sparse_autoencoders)[0]

### L0 Test and Reconstruction Test

In [None]:
# Eval mode prevents an error when the forward pass would expect a dead-neuron mask (ghost grads).
sparse_autoencoder.eval()
with torch.no_grad():
    batch_tokens = activation_store.get_batch_tokens()
    _, cache = model.run_with_cache(batch_tokens, prepend_bos=True)
    sae_outputs = sparse_autoencoder(cache[sparse_autoencoder.cfg.hook_point])
    sae_out, feature_acts, loss, mse_loss, l1_loss, _ = sae_outputs
    del cache

    # Skip the BOS position, count the features active on each token,
    # then average across batch and position.
    fired = feature_acts[:, 1:] > 0
    l0 = fired.float().sum(-1).detach()
    print("average l0", l0.mean().item())
    l0_histogram = px.histogram(l0.flatten().cpu().numpy())
    l0_histogram.show()

# Next: a reconstruction test.
def reconstr_hook(activation, hook, sae_out):
    """Forward hook that ignores the incoming activation and returns ``sae_out`` in its place."""
    replacement = sae_out
    return replacement


def zero_abl_hook(activation, hook):
    """Forward hook for zero-ablation: returns an all-zeros tensor like ``activation``."""
    ablated = torch.zeros_like(activation)
    return ablated


# `sae_out` was computed from cache[sparse_autoencoder.cfg.hook_point], so both interventions
# must patch that same hook. A hard-coded layer (previously resid_pre 10) would splice the
# reconstruction into a different layer than the one the SAE was applied to.
sae_hook_point = sparse_autoencoder.cfg.hook_point

# Loss with no intervention.
print("Orig", model(batch_tokens, return_type="loss").item())
# Loss when the hooked activation is replaced by the SAE output.
print(
    "reconstr",
    model.run_with_hooks(
        batch_tokens,
        fwd_hooks=[
            (
                sae_hook_point,
                partial(reconstr_hook, sae_out=sae_out),
            )
        ],
        return_type="loss",
    ).item(),
)
# Loss when the hooked activation is replaced by zeros.
print(
    "Zero",
    model.run_with_hooks(
        batch_tokens,
        return_type="loss",
        fwd_hooks=[(sae_hook_point, zero_abl_hook)],
    ).item(),
)

### Specific Capability Test

In [None]:
# Japanese keyword-extraction prompt and the opening of the expected answer.
example_prompt = "文章からキーワードを選び、その解釈を述べなさい。 新型コロナウイルスが広がらないようにするため、政府は建物の中で近くに人がいるときや話すときは、マスクをしたほうがいいと言っていました。3月13日からはこの考え方を変えて、マスクをするかどうか自分で決めるようにすると言いました。 しかし、病院やお年寄りの施設などに行くときや、混んでいる電車やバスに乗るときは、マスクをしたほうがいいと言っています。 学校の中では、4月1日からマスクをするように言わないことにします。その前でも卒業式ではマスクをしなくてもいいと言っています。 政府は、会社や学校などで困らないように、どんなときにマスクが必要かをしっかり伝えると言っています。"
example_answer = "この文章に含まれるキーワードとその解釈は以下です。"
# Run the prompt test for this prompt/answer pair.
utils.test_prompt(example_prompt, example_answer, model, prepend_bos=True)

# Cache the model's activations on the prompt, then run the SAE on the hooked activation.
logits, cache = model.run_with_cache(example_prompt, prepend_bos=True)
tokens = model.to_tokens(example_prompt)
prompt_sae_outputs = sparse_autoencoder(cache[sparse_autoencoder.cfg.hook_point])
sae_out, feature_acts, loss, mse_loss, l1_loss, _ = prompt_sae_outputs

### Generating Feature Interfaces


In [None]:
# Create custom Japanese dataset
from datasets import load_dataset

# Keep only the rows whose language is Japanese and rename the "inputs" column to "text".
dataset = (
    load_dataset("CohereForAI/aya_dataset")
    .filter(lambda row: row['language'] == 'Japanese')
    .rename_column("inputs", "text")
)

# Publish the filtered dataset to the Hub as "cohere-jp".
dataset.push_to_hub("cohere-jp")

In [None]:
# Ten largest feature activations at the last position of the first sequence.
vals, inds = torch.topk(feature_acts[0, -1].detach().cpu(), 10)
# Label bars with plain integer feature indices: iterating the tensor directly yields
# 0-d tensors whose str() is "tensor(123)", which makes unreadable axis labels.
px.bar(x=[str(i) for i in inds.tolist()], y=vals).show()

# Invert the tokenizer vocab into {token id: token text}, showing "Ġ" as a space
# and newlines as a literal "\n".
vocab_dict = {
    token_id: token.replace("Ġ", " ").replace("\n", "\\n")
    for token, token_id in model.tokenizer.vocab.items()
}

# Write the mapping to vocab_dict.json in the working directory; an existing file is left untouched.
vocab_dict_filepath = Path(os.getcwd()) / "vocab_dict.json"
if not vocab_dict_filepath.exists():
    vocab_dict_filepath.write_text(json.dumps(vocab_dict))


os.environ["TOKENIZERS_PARALLELISM"] = "false"
# This dataset is used for now to avoid dealing with tokenization while streaming.
data = load_dataset("kcoopermiller/cohere-jp", split="train")
# Tokenize with max_length=128, then shuffle with seed 42.
tokenized_data = utils.tokenize_and_concatenate(
    data, model.tokenizer, max_length=128
).shuffle(42)
all_tokens = tokenized_data["tokens"]


# Probably not much more time can be squeezed out of this for now. The biggest remaining
# saving would be parallelizing the whole sequence indexing, which may not be worth it yet.

# Number of token rows to feed the visualisation, and the features to visualise
# (the top-k indices found above).
total_batch_size = 4096 * 5
feature_idx = list(inds.flatten().cpu().numpy())
# Alternative settings kept for reference:
# max_batch_size = 512
# total_batch_size = 16384
# feature_idx = list(range(1000))

feature_vis_params = SaeVisConfig(
    features=feature_idx,
    hook_point=sparse_autoencoder.cfg.hook_point,
    minibatch_size_features=256,
    minibatch_size_tokens=64,
    verbose=True,
)



# Restrict the visualisation input to the first total_batch_size rows of tokens.
tokens = all_tokens[:total_batch_size]

# NOTE(review): the Dict[int, FeatureData] annotation does not match how the result is used
# below (attribute assignment and a save_feature_centric_vis method call) — confirm the
# actual return type for the installed sae_vis version.
feature_data: Dict[int, FeatureData] = get_feature_data(
    encoder=sparse_autoencoder,
    model=model,
    tokens=tokens,
    cfg=feature_vis_params
)

# Attach the model to the returned object before saving.
feature_data.model = model

# Write one HTML file per selected feature, named data_<feature index, zero-padded to 4>.html.
for test_idx in list(inds.flatten().cpu().numpy()):
    feature_data.save_feature_centric_vis(
        f"data_{test_idx:04}.html",
        feature_idx=test_idx
    )

# Merging

In [None]:
# Fetch mergekit and install it (editable) into its own virtualenv at mergekit/env.
!git clone -q https://github.com/cg123/mergekit.git
!python3 -m venv mergekit/env
# Activation is chained with `&&` so it applies to the pip call on the same line.
!source mergekit/env/bin/activate && pip install -q -e ./mergekit

## Frankenmerge / Passthrough
*concatenate layers from both models*

You can also use [MergeKit GUI Space](https://huggingface.co/spaces/arcee-ai/mergekit-gui) to merge


In [None]:
MODEL_NAME = "MATS-frankenmerge"
# Passthrough merge recipe: one slice taken from each source model.
# NOTE(review): the second slice uses a single-element `layer_range: [2]` — confirm that
# mergekit accepts this form (the first slice uses a [start, end] pair).
yaml_config = """
slices:
  - sources:
    - model: llm-jp/llm-jp-1.3b-v1.0
      layer_range: [0, 24]
  - sources:
    - model: Qwen/Qwen1.5-0.5B
      layer_range: [2]
merge_method: passthrough
dtype: bfloat16
"""

# Write the recipe verbatim to merge.yaml in the working directory.
with open('merge.yaml', mode='w', encoding="utf-8") as merge_config_file:
    print(yaml_config, end="", file=merge_config_file)


In [None]:
!cd mergekit && source env/bin/activate && mergekit-yaml ../merge.yaml merge --copy-tokenizer --allow-crimes --out-shard-size 1B --lazy-unpickle

In [None]:
from huggingface_hub import ModelCard, ModelCardData, HfApi
from jinja2 import Template
from google.colab import userdata

# Hub account that will own the merged model repository.
username = "kcoopermiller"

# Jinja template for the model card: YAML front matter (license + tags, one tag per source
# model), a short description with links to the source models, and the merge configuration
# in a fenced yaml block. The fences are plain triple backticks: a backslash before a backtick
# is not a valid Python escape, so it would be kept literally and break the rendered fence.
template_text = """
---
license: apache-2.0
tags:
- merge
- mergekit
- lazymergekit
{%- for model in models %}
- {{ model }}
{%- endfor %}
---

# {{ model_name }}

{{ model_name }} is a merge of the following models using [mergekit](https://github.com/cg123/mergekit):

{%- for model in models %}
* [{{ model }}](https://huggingface.co/{{ model }})
{%- endfor %}

## 🧩 Configuration

```yaml
{{- yaml_config -}}
```
"""

# Build the Jinja template from the card text, with surrounding whitespace stripped.
jinja_template = Template(template_text.strip())

# Collect the model names referenced by the merge config; where they live depends on
# which top-level keys the config has.
data = yaml.safe_load(yaml_config)
if "models" in data:
    # Only entries that carry a "parameters" key are listed.
    models = [entry["model"] for entry in data["models"] if "parameters" in entry]
elif "parameters" in data:
    # Every source of the first slice.
    models = [source["model"] for source in data["slices"][0]["sources"]]
elif "slices" in data:
    # The first source of every slice.
    models = [merge_slice["sources"][0]["model"] for merge_slice in data["slices"]]
else:
    raise Exception("No models or slices found in yaml config")

# Render the card text from the template.
template_fields = {
    "model_name": MODEL_NAME,
    "models": models,
    "yaml_config": yaml_config,
    "username": username,
}
content = jinja_template.render(**template_fields)

# Write the card as README.md inside the merge folder.
card = ModelCard(content)
card.save('merge/README.md')

# The token is read from the Google Colab secrets tab (key "HF_TOKEN").
api = HfApi(token=userdata.get("HF_TOKEN"))

merged_repo_id = f"{username}/{MODEL_NAME}"

# exist_ok=True keeps this cell re-runnable: without it, create_repo raises once the
# repository already exists (e.g. after a first successful run).
api.create_repo(
    repo_id=merged_repo_id,
    repo_type="model",
    exist_ok=True,
)
# Upload the contents of the local "merge" folder to the repository.
api.upload_folder(
    repo_id=merged_repo_id,
    folder_path="merge",
)