In [46]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

english_tokenizer = AutoTokenizer.from_pretrained("roneneldan/TinyStories-2Layers-33M")
model = AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-2Layers-33M")



In [47]:
english_tokenizer

GPT2TokenizerFast(name_or_path='roneneldan/TinyStories-2Layers-33M', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [48]:
# load a Chinese tokenizer
chinese_tokenizer = AutoTokenizer.from_pretrained("m-a-p/neo_7b", use_fast=False, trust_remote_code=True)
chinese_tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


NEOTokenizer(name_or_path='m-a-p/neo_7b', vocab_size=64000, model_max_length=4096, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'additional_special_tokens': ['<|CLS|>', '<|SEP|>', '<|EOD|>', '<|MASK|>', '<|PAD|>']}, clean_up_tokenization_spaces=False)

In [43]:
# vocab size is too big. Since we are only using this for in-context learning, we can remap them by scaling to the smaller vocab size
# while ensuring no overlap
# let's do this on the text

In [44]:
# This is generated by GPT4 with the prompt (roughly) "Generate 100 short Chinese sentences using only the most common words"

text = "他在中国。我们去学校。这个人很好。她在上课。他们有问题。我喜欢这里。他说我们。学生很多。这个家很好。天气很好。我们在这里。他是学生。我有很多书。她的名字很长。我们一起学习。他今天没来。这个地方很大。她是我的朋友。我们要去那里。他们喜欢读书。我有一个问题。她在家看电视。我们去上课。这个孩子很聪明。天气非常好。他的书很有趣。我们都在这里。他喜欢运动。她的朋友很多。他们在学习。他今天很忙。我们在一起。她的家很大。他喜欢看书。我们去吃饭。这个问题很难。天气很好，我们出去吧。他的朋友来了。我们在学校学习。她的中文很好。他们一起玩游戏。我喜欢这个地方。他是一个好老师。我们去商店买东西。这个人很有意思。天气不好，我们在家。他在看电视。我们要努力学习。她的书很有趣。我们在公园玩。他喜欢音乐。我们一起去旅行。这个孩子很可爱。天气很冷，我们穿多点。他的工作很忙。我们去电影院看电影。这个问题很简单。她在家看书。我们去学校上课。这个地方很美。天气很好，我们去散步。他的朋友很多。我们一起吃饭。她喜欢唱歌。他们在学习中文。我喜欢这个学校。他是一个好学生。我们去公园玩。这个人很聪明。天气很热，我们去游泳。他的书很有意思。我们在一起玩游戏。她喜欢画画。他们在图书馆看书。我喜欢这个城市。他是一个好朋友。我们去买东西。这个孩子很聪明。天气不好，我们在家。他的家很漂亮。他今天去上班。我们在图书馆学习。这个城市很大。她的朋友很好。他们喜欢运动。我喜欢看电影。他说他很忙。学生们在上课。这个家很温暖。天气冷了，多穿点。我们一起去旅行。他是我的老师。我有很多朋友。她喜欢听音乐。我们去海边玩。这个问题不难。天气热，我们喝水。他的家在山上。我们在咖啡馆见面。他在学校工作。我们去买书。这个孩子很调皮。天气不错，出去走走。他的家人很好。我们在看电影。她喜欢画画。他们在打篮球。我喜欢这个房子。他是一个好医生。我们去公园散步。这个人很幽默。天气很好，我们去野餐。他的猫很可爱。我们一起吃早餐。她喜欢写作。他们在学习英语。我喜欢这个老师。他是一个好学生。我们去商店买东西。这个孩子很聪明。天气不好，我们在家。他的朋友很有趣。我们去博物馆。她喜欢看电影。他们在练习跑步。我喜欢这个地方。他是一个好司机。我们去吃午饭。这个人很勤奋。天气热，我们去游泳。这个人很友好。天气很好，我们去公园。他的书包很重。我们在学校见面。他喜欢游泳。我喜欢这个音乐。她是一个好医生。我们去吃晚饭。这个地方很安静。"

In [71]:
class ChineseProcesser:
    def __init__(self, chinese_tokenizer, english_tokenizer):
        self.english_to_chinese = {}
        self.chinese_to_english = {}
        self.chinese_tokenizer = chinese_tokenizer
        self.english_tokenizer = english_tokenizer
    
    def encode(self, text:str):
        self.english_to_chinese = {}
        self.chinese_to_english = {}

        message = [text]
        inputs = self.chinese_tokenizer(message, return_tensors='pt', return_token_type_ids=False)
        print(inputs['input_ids'])
        input_ids = inputs['input_ids']
        remapped_input_ids = input_ids / chinese_tokenizer.vocab_size * english_tokenizer.vocab_size
        remapped_input_ids = remapped_input_ids.long()
        assert len(set(remapped_input_ids)) == len(set(input_ids)) # no collision
        for c, e in zip(remapped_input_ids[0], input_ids[0]):
            self.chinese_to_english[c.item()] = e.item()
            self.english_to_chinese[e.item()] = c.item()
        inputs['input_ids'] = remapped_input_ids
        return inputs

    def decode(self, response):
        ids = response[0]
        chinese_ids = []
        for i in ids:
            if i.item() in self.english_to_chinese.keys():
                chinese_ids.append(self.english_to_chinese[i.item()])
            else:
                chinese_ids.append(i / english_tokenizer.vocab_size * chinese_tokenizer.vocab_size)
                chinese_ids[-1] = chinese_ids[-1].long()
        unseen_tokens = sum([x not in self.chinese_to_english.keys() for x in chinese_ids])
        if unseen_tokens > 0:
            print(f"WARN: unseen tokens: {unseen_tokens}, total tokens: {len(chinese_ids)}")
        print(chinese_ids)
        return self.chinese_tokenizer.decode(chinese_ids)

chinese_processer = ChineseProcesser(chinese_tokenizer, english_tokenizer)

In [72]:
inputs = chinese_processer.encode(text)
print(inputs)

tensor([[16388, 10653, 56306,   865, 56473,  2396, 56306, 41051,  7126, 56306,
         30089, 21008, 56306,  1626, 30109, 56306, 30130,  3424, 56306, 10432,
           865, 56306,  1575,  2493, 56306,  1539, 56303,  7126, 56306,  9562,
          7126, 56306,   865, 10156, 56306, 19142,  1575, 56306, 56263, 14916,
         56588, 56306,  6867,  6553, 47250, 56306,   865,  3725,  2096, 56306,
         56327,  3698, 56429, 56290, 56306,  1539,  3529,  9670, 56306, 56662,
         33218,  2447, 56306, 19970, 56473, 10775, 56306,  1626,  2854, 13485,
         56306, 56263, 10072,  1234, 56306, 56662, 20302, 52347, 56306,   865,
         56473, 21008, 56306,  1539,  2677, 56426, 23267, 56306,  9562, 25013,
         56306,  3649, 56588, 11273, 57323, 56306,   865,  8998,  3424, 56306,
         56327,  2854,  3488, 56306,  6867,  2447,  2493, 56306, 21593,  2096,
         56306, 56327,  3698, 56426, 57661, 56306,   865,  9939, 56306,  6867,
         56303,  9670, 56306, 56327,  2854, 40918, 5

In [73]:
response = model.generate(**inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
print(response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[12868,  8365, 44215,   679, 44346,  1881, 44215, 32235,  5595, 44215,
         23627, 16496, 44215,  1276, 23643, 44215, 23660,  2688, 44215,  8191,
           679, 44215,  1236,  1957, 44215,  1208, 44212,  5595, 44215,  7508,
          5595, 44215,   679,  7975, 44215, 15031,  1236, 44215, 44181, 11713,
         44436, 44215,  5392,  5145, 37103, 44215,   679,  2925,  1645, 44215,
         44231,  2903, 44311, 44202, 44215,  1208,  2771,  7593, 44215, 44494,
         26084,  1921, 44215, 15681, 44346,  8461, 44215,  1276,  2241, 10589,
         44215, 44181,  7909,   969, 44215, 44494, 15942, 41106, 44215,   679,
         44346, 16496, 44215,  1208,  2102, 44309, 18270, 44215,  7508, 19641,
         44215,  2865, 44436,  8852, 45013, 44215,   679,  7065,  2688, 44215,
         44231,  2241,  2739, 44215,  5392,  1921,  1957, 44215, 16956,  1645,
         44215, 44231,  2903, 44309, 45279, 44215,   679,  7804, 44215,  5392,
         44212,  7593, 44215, 44231,  2241, 32131, 4

In [74]:
response_str = chinese_processer.decode(response)
print(response_str)

WARN: unseen tokens: 724, total tokens: 724
[tensor(16386), tensor(10652), tensor(56305), tensor(864), tensor(56472), tensor(2395), tensor(56305), tensor(41049), tensor(7124), tensor(56305), tensor(30087), tensor(21006), tensor(56305), tensor(1624), tensor(30108), tensor(56305), tensor(30129), tensor(3423), tensor(56305), tensor(10430), tensor(864), tensor(56305), tensor(1573), tensor(2492), tensor(56305), tensor(1538), tensor(56301), tensor(7124), tensor(56305), tensor(9561), tensor(7124), tensor(56305), tensor(864), tensor(10155), tensor(56305), tensor(19141), tensor(1573), tensor(56305), tensor(56262), tensor(14915), tensor(56587), tensor(56305), tensor(6866), tensor(6551), tensor(47248), tensor(56305), tensor(864), tensor(3724), tensor(2094), tensor(56305), tensor(56326), tensor(3696), tensor(56428), tensor(56289), tensor(56305), tensor(1538), tensor(3528), tensor(9669), tensor(56305), tensor(56661), tensor(33216), tensor(2446), tensor(56305), tensor(19969), tensor(56472), tensor(1

In [70]:
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, revision="v1.1.0")



In [4]:
tokenizer

ChatGLMTokenizer(name_or_path='THUDM/chatglm-6b', vocab_size=130344, model_max_length=2048, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<sop>', 'eos_token': '<eop>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [11]:
# Get the vocabulary
vocab = tokenizer.get_vocab()

# Sort the vocabulary by index
sorted_vocab = sorted(vocab.items(), key=lambda item: item[1])

# Get the first few vocabulary entries (e.g., first 10 entries)
first_few_vocab = sorted_vocab[:10]

# Print the first few vocabulary entries
for token, index in first_few_vocab:
    print(f"Token: {token}, Index: {index}")

Token: <unk>, Index: 0
Token: <s>, Index: 1
Token: </s>, Index: 2
Token: <pad>, Index: 3
Token: <n>, Index: 4
Token: ▁, Index: 5
Token: ,, Index: 6
Token: ., Index: 7
Token: 0, Index: 8
Token: 1, Index: 9


In [12]:
# Get the first few vocabulary entries (e.g., first 10 entries)
first_few_vocab = sorted_vocab[-10:]

# Print the first few vocabulary entries
for token, index in first_few_vocab:
    print(f"Token: {token}, Index: {index}")

Token: <0xF6>, Index: 130334
Token: <0xF7>, Index: 130335
Token: <0xF8>, Index: 130336
Token: <0xF9>, Index: 130337
Token: <0xFA>, Index: 130338
Token: <0xFB>, Index: 130339
Token: <0xFC>, Index: 130340
Token: <0xFD>, Index: 130341
Token: <0xFE>, Index: 130342
Token: <0xFF>, Index: 130343


In [15]:
# Get the first few vocabulary entries (e.g., first 10 entries)
first_few_vocab = sorted_vocab[700:]

# Print the first few vocabulary entries
for token, index in first_few_vocab:
    print(f"Token: {token}, Index: {index}")

Token: ▁added, Index: 700
Token: html, Index: 701
Token: us, Index: 702
Token: ▁love, Index: 703
Token: ▁State, Index: 704
Token: ▁array, Index: 705
Token: ▁though, Index: 706
Token: ▁private, Index: 707
Token: ▁base, Index: 708
Token: ▁head, Index: 709
Token: value, Index: 710
Token: ▁protein, Index: 711
Token: ▁away, Index: 712
Token: ▁evidence, Index: 713
Token: ▁become, Index: 714
Token: ▁didn, Index: 715
Token: ▁five, Index: 716
Token: ▁potential, Index: 717
Token: ▁space, Index: 718
Token: ▁across, Index: 719
Token: ▁On, Index: 720
Token: as, Index: 721
Token: for, Index: 722
Token: ▁significantly, Index: 723
Token: ▁major, Index: 724
Token: test, Index: 725
Token: add, Index: 726
Token: ▁seen, Index: 727
Token: J, Index: 728
Token: ▁Trump, Index: 729
Token: ▁al, Index: 730
Token: ▁view, Index: 731
Token: ▁via, Index: 732
Token: ▁X, Index: 733
Token: ▁general, Index: 734
Token: String, Index: 735
Token: ▁pre, Index: 736
Token: ▁old, Index: 737
Token: ▁cancer, Index: 738
Token: ▁t

In [16]:
bctokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)
bctokenizer

A new version of the following files was downloaded from https://huggingface.co/baichuan-inc/Baichuan-7B:
- tokenization_baichuan.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


BaiChuanTokenizer(name_or_path='baichuan-inc/Baichuan-7B', vocab_size=64000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)

In [17]:
tokenizer = AutoTokenizer.from_pretrained("m-a-p/neo_7b")

ValueError: Tokenizer class NEOTokenizer does not exist or is not currently imported.

In [32]:
tokenizer = AutoTokenizer.from_pretrained("m-a-p/neo_7b", use_fast=False, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Load the original tokenizer
tokenizer = AutoTokenizer.from_pretrained("m-a-p/neo_7b", use_fast=False, trust_remote_code=True)

# Get the vocabulary
vocab = tokenizer.get_vocab()

# Sort the vocabulary by index
sorted_vocab = sorted(vocab.items(), key=lambda item: item[1])

# Trim the vocabulary to the desired size
trimmed_vocab = sorted_vocab[:50257]

# Create a new vocabulary dictionary
new_vocab = {token: index for index, (token, _) in enumerate(trimmed_vocab)}

# Create a new tokenizer with the trimmed vocabulary
new_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, vocab=new_vocab)

# Save the new tokenizer
new_tokenizer.save_pretrained("chinese_tokenizer")

# Optionally, reload the tokenizer to ensure it works correctly
reloaded_tokenizer = PreTrainedTokenizerFast.from_pretrained("chinese_tokenizer")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


AttributeError: 'NEOTokenizer' object has no attribute 'no_truncation'

In [5]:
# Extract special tokens
metadata = {
    "unk_token": tokenizer.unk_token,
    "pad_token": tokenizer.pad_token,
    "bos_token": tokenizer.bos_token,
    "eos_token": tokenizer.eos_token,
    "additional_special_tokens": tokenizer.additional_special_tokens,
}

# Print the metadata
print(metadata)

{'unk_token': '<unk>', 'pad_token': '<unk>', 'bos_token': '<s>', 'eos_token': '</s>', 'additional_special_tokens': ['<|CLS|>', '<|SEP|>', '<|EOD|>', '<|MASK|>', '<|PAD|>']}


In [6]:
vocab

{'<unk>': 0,
 '<s>': 1,
 '</s>': 2,
 '<0x00>': 3,
 '<0x01>': 4,
 '<0x02>': 5,
 '<0x03>': 6,
 '<0x04>': 7,
 '<0x05>': 8,
 '<0x06>': 9,
 '<0x07>': 10,
 '<0x08>': 11,
 '<0x09>': 12,
 '<0x0A>': 13,
 '<0x0B>': 14,
 '<0x0C>': 15,
 '<0x0D>': 16,
 '<0x0E>': 17,
 '<0x0F>': 18,
 '<0x10>': 19,
 '<0x11>': 20,
 '<0x12>': 21,
 '<0x13>': 22,
 '<0x14>': 23,
 '<0x15>': 24,
 '<0x16>': 25,
 '<0x17>': 26,
 '<0x18>': 27,
 '<0x19>': 28,
 '<0x1A>': 29,
 '<0x1B>': 30,
 '<0x1C>': 31,
 '<0x1D>': 32,
 '<0x1E>': 33,
 '<0x1F>': 34,
 '<0x20>': 35,
 '<0x21>': 36,
 '<0x22>': 37,
 '<0x23>': 38,
 '<0x24>': 39,
 '<0x25>': 40,
 '<0x26>': 41,
 '<0x27>': 42,
 '<0x28>': 43,
 '<0x29>': 44,
 '<0x2A>': 45,
 '<0x2B>': 46,
 '<0x2C>': 47,
 '<0x2D>': 48,
 '<0x2E>': 49,
 '<0x2F>': 50,
 '<0x30>': 51,
 '<0x31>': 52,
 '<0x32>': 53,
 '<0x33>': 54,
 '<0x34>': 55,
 '<0x35>': 56,
 '<0x36>': 57,
 '<0x37>': 58,
 '<0x38>': 59,
 '<0x39>': 60,
 '<0x3A>': 61,
 '<0x3B>': 62,
 '<0x3C>': 63,
 '<0x3D>': 64,
 '<0x3E>': 65,
 '<0x3F>': 66,
 '<0x40>': 

In [8]:
import json

# Optionally, reload the tokenizer to en# Save vocabulary to a JSON file
with open("vocab.json", "w") as vocab_file:
    json.dump(vocab, vocab_file)

# Save tokenizer configuration to a JSON file
with open("tokenizer_config.json", "w") as config_file:
    json.dump(metadata, config_file)

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer using the saved vocabulary and configuration
tokenizer = AutoTokenizer.from_pretrained(
    ".",  # You can also provide a directory path containing both files
    vocab_file="vocab.json",
    tokenizer_config_file="tokenizer_config.json"
)

# Test the tokenizer
print(tokenizer.tokenize("Hello world!"))


OSError: . does not appear to have a file named config.json. Checkout 'https://huggingface.co/./None' for available files.

In [10]:
len(vocab)

64005