# Constant

In [None]:
from pathlib import Path


ROOT_DIR = Path("vocab")
ROOT_DIR.mkdir(parents=True, exist_ok=True)

COMMON_VOCAB_DIR = ROOT_DIR / "common_english_words"
COMMON_VOCAB_DIR.mkdir(parents=True, exist_ok=True)

MODEL_LIST = [
    "01-ai/Yi-34B-Chat",
    "mistralai/Mistral-7B-v0.1",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "meta-llama/Llama-2-7b-chat-hf",
    "meta-llama/Llama-2-13b-chat-hf",
    "meta-llama/Llama-2-70b-chat-hf",
    "codellama/CodeLlama-34b-Instruct-hf",
    "lmsys/vicuna-13b-v1.5",
    "Nexusflow/Starling-LM-7B-beta",
]

# Load Model Vocabulary

In [5]:
import json
from pathlib import Path
from transformers import AutoTokenizer

# Ensure ROOT_DIR exists at the start
ROOT_DIR = Path("vocab")
ROOT_DIR.mkdir(parents=True, exist_ok=True)

MODEL_LIST = [
    "01-ai/Yi-34B-Chat",
    "mistralai/Mistral-7B-v0.1",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "meta-llama/Llama-2-7b-chat-hf",
    "meta-llama/Llama-2-13b-chat-hf",
    "meta-llama/Llama-2-70b-chat-hf",
    "codellama/CodeLlama-34b-Instruct-hf",
    "lmsys/vicuna-13b-v1.5",
    "Nexusflow/Starling-LM-7B-beta",
]


def generate_save_path(model_path: str, root_dir: Path) -> Path:
    """Generates the save path for a model's vocabulary based on the model path."""
    _, model_name = model_path.split("/")
    return root_dir / f"{model_name}.json"


def save_vocab(model_path: str, save_path: Path) -> None:
    """Saves the vocabulary of a given model if it doesn't already exist."""
    if save_path.exists():
        print(f"Vocab file {save_path} already exists. Skipping...")
        return

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        vocab = tokenizer.get_vocab()
        save_path.write_text(json.dumps(vocab, ensure_ascii=False, indent=4))
        print(f"Successfully saved vocab for '{model_path}' to {save_path}")
    except Exception as e:
        print(f"Failed to save vocab for '{model_path}': {e}")


def main() -> None:
    for model_path in MODEL_LIST:
        save_path = generate_save_path(model_path, ROOT_DIR)
        save_vocab(model_path, save_path)


if __name__ == "__main__":
    main()

Vocab file vocab/Yi-34B-Chat.json already exists. Skipping...
Vocab file vocab/Mistral-7B-v0.1.json already exists. Skipping...
Vocab file vocab/Mixtral-8x7B-Instruct-v0.1.json already exists. Skipping...
Vocab file vocab/Llama-2-7b-chat-hf.json already exists. Skipping...
Vocab file vocab/Llama-2-13b-chat-hf.json already exists. Skipping...
Vocab file vocab/Llama-2-70b-chat-hf.json already exists. Skipping...
Vocab file vocab/CodeLlama-34b-Instruct-hf.json already exists. Skipping...
Vocab file vocab/vicuna-13b-v1.5.json already exists. Skipping...
Vocab file vocab/Starling-LM-7B-beta.json already exists. Skipping...


# Load Vocab

In [22]:
import json
from pathlib import Path
from collections import OrderedDict


class Vocab(OrderedDict):
    def __init__(self, model_vocab_paths: dict[str, Path], common_vocab_dir: Path):
        super().__init__()
        self.load_vocabs(model_vocab_paths, common_vocab_dir)

    def load_vocabs(
        self, model_vocab_paths: dict[str, Path], common_vocab_dir: Path
    ) -> None:
        # Load model-specific vocabularies
        self.update(
            {
                model_name: self.load_vocab(path, is_json=True)
                for model_name, path in model_vocab_paths.items()
            }
        )

        # Dynamically load all common vocabularies from the specified directory
        common_vocab_paths = common_vocab_dir.glob("*.txt")
        self.update({path.stem: self.load_vocab(path) for path in common_vocab_paths})

    @staticmethod
    def load_vocab(path: Path, is_json: bool = False) -> set[str]:
        with path.open("r", encoding="utf-8") as file:
            if is_json:
                return set(json.load(file))
            return {line.strip() for line in file if line.strip()}


def create_model_vocab_paths(model_list: list[str], root_dir: Path) -> dict[str, Path]:
    return {
        Path(model).parts[-1]: root_dir / f"{Path(model).parts[-1]}.json"
        for model in model_list
    }


model_vocab_paths = create_model_vocab_paths(MODEL_LIST, ROOT_DIR)
vocab = Vocab(model_vocab_paths, COMMON_VOCAB_DIR)

odict_keys(['Yi-34B-Chat', 'Mistral-7B-v0.1', 'Mixtral-8x7B-Instruct-v0.1', 'Llama-2-7b-chat-hf', 'Llama-2-13b-chat-hf', 'Llama-2-70b-chat-hf', 'CodeLlama-34b-Instruct-hf', 'vicuna-13b-v1.5', 'Starling-LM-7B-beta', 'wiki_most_100_common_word_in_english', 'oxford_3000', 'oxford_5000'])