From 8d5580a10cf3647a4f2a2a0b1003d43894c51e78 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Fri, 15 Mar 2024 00:38:57 +0200
Subject: [PATCH 01/17] Allow custom kwargs in `tokenizer.apply_chat_template`

---
 src/tokenizers.js | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index 9692cf3b0..b8fd24350 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -3005,6 +3005,7 @@ export class PreTrainedTokenizer extends Callable {
         truncation = false,
         max_length = null,
         return_tensor = true,
+        ...kwargs
     } = {}) {

         chat_template ??= this.chat_template ?? this.default_chat_template;
@@ -3029,6 +3030,7 @@ export class PreTrainedTokenizer extends Callable {
             add_generation_prompt: add_generation_prompt,

             ...special_tokens_map,
+            ...kwargs,
         });

         if (tokenize) {

From 8d617b2ad13aa093298a5f9d2227e15df4d9b878 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Fri, 15 Mar 2024 03:49:59 +0200
Subject: [PATCH 02/17] Update jinja dependency version

---
 package-lock.json | 8 ++++----
 package.json      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index e3a171abe..61007e0e9 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,7 +9,7 @@
       "version": "2.16.0",
       "license": "Apache-2.0",
       "dependencies": {
-        "@huggingface/jinja": "^0.2.1",
+        "@huggingface/jinja": "^0.2.2",
         "onnxruntime-web": "1.14.0",
         "sharp": "^0.32.0"
       },
@@ -745,9 +745,9 @@
       }
     },
     "node_modules/@huggingface/jinja": {
-      "version": "0.2.1",
-      "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.1.tgz",
-      "integrity": "sha512-HxjVCll8oGfgUQmN91NYWCjfuaQ5mYZkc/BB1gjfp28q3s48yiB5jUEV7BvaRdIAb/+14cNdX8TIdalFykwywA==",
+      "version": "0.2.2",
+      "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz",
+      "integrity": "sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==",
       "engines": {
         "node": ">=18"
       }
diff --git a/package.json b/package.json
index 446fd5f16..4f69df2c1 100644
--- a/package.json
+++ b/package.json
@@ -40,7 +40,7 @@
   "dependencies": {
     "onnxruntime-web": "1.14.0",
     "sharp": "^0.32.0",
-    "@huggingface/jinja": "^0.2.1"
+    "@huggingface/jinja": "^0.2.2"
   },
   "optionalDependencies": {
     "onnxruntime-node": "1.14.0"
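Note: with [PATCH 01/17] applied, any extra option passed to `apply_chat_template` is forwarded to the Jinja rendering context, so a custom chat template can reference caller-supplied variables. A minimal illustrative sketch (not part of the patches; the extra `documents` value is an assumption and is simply ignored by templates that never reference it):

    import { AutoTokenizer } from '@xenova/transformers';

    const tokenizer = await AutoTokenizer.from_pretrained('Xenova/llama-tokenizer');
    const prompt = tokenizer.apply_chat_template(
        [{ role: 'user', content: 'Hello, how are you?' }],
        {
            tokenize: false,
            add_generation_prompt: true,
            // Forwarded to the template, where it is available as `documents`.
            documents: ['Emperor penguins are the tallest.'],
        }
    );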
From 4524fe10a4e51070a88144e56100ee61f7eaa89b Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Fri, 15 Mar 2024 03:55:11 +0200
Subject: [PATCH 03/17] Add `tokenizer_kwargs` options

---
 src/tokenizers.js | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index b8fd24350..29166d1c3 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -2995,6 +2995,7 @@ export class PreTrainedTokenizer extends Callable {
      * @param {number} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is false.
      * If not specified, the tokenizer's `max_length` attribute will be used as a default.
      * @param {boolean} [options.return_tensor=true] Whether to return the output as a Tensor or an Array. Has no effect if tokenize is false.
+     * @param {Object} [options.tokenizer_kwargs={}] Additional options to pass to the tokenizer.
      * @returns {string | Tensor | number[]| number[][]} The tokenized output.
      */
     apply_chat_template(conversation, {
@@ -3005,6 +3006,7 @@ export class PreTrainedTokenizer extends Callable {
         truncation = false,
         max_length = null,
         return_tensor = true,
+        tokenizer_kwargs = {},
         ...kwargs
     } = {}) {
@@ -3040,6 +3042,7 @@ export class PreTrainedTokenizer extends Callable {
             truncation,
             max_length,
             return_tensor,
+            ...tokenizer_kwargs,
         }).input_ids;
     }

From d10141f3d61c270883c620df9a634b23caa65378 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Fri, 15 Mar 2024 04:47:28 +0200
Subject: [PATCH 04/17] Add support for dictionaries of chat templates in the tokenizer config

---
 src/tokenizers.js | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index 29166d1c3..12f9ca913 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -2519,6 +2519,15 @@ export class PreTrainedTokenizer extends Callable {
         this.legacy = false;

         this.chat_template = tokenizerConfig.chat_template ?? null;
+        if (Array.isArray(this.chat_template)) {
+            // Chat templates are stored as lists of dicts with fixed key names,
+            // we reconstruct that into a single dict while loading them.
+            const chat_template = {};
+            for (const item of this.chat_template) {
+                chat_template[item['name']] = item['template'];
+            }
+            this.chat_template = chat_template;
+        }
         this._compiled_template_cache = new Map();
     }
@@ -3010,7 +3019,30 @@ export class PreTrainedTokenizer extends Callable {
         ...kwargs
     } = {}) {

-        chat_template ??= this.chat_template ?? this.default_chat_template;
+        // First, handle the cases when the model has a dict of multiple templates
+        if (
+            (this.chat_template && typeof this.chat_template === 'object') ||
+            (this.chat_template === null && this.default_chat_template && typeof this.default_chat_template === 'object')
+        ) {
+            const template_dict = this.chat_template ?? this.default_chat_template;
+
+            if (chat_template !== null && chat_template in template_dict) {
+                // The user can pass the name of a template to the chat template argument instead of an entire template
+                chat_template = template_dict[chat_template];
+            } else if (chat_template === null && "default" in template_dict) {
+                chat_template = template_dict["default"];
+            } else if (chat_template === null) {
+                throw Error(
+                    `This model has multiple chat templates with no default specified! Please either pass a chat ` +
+                    `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
+                    `template names are ${Object.keys(template_dict).sort()}.`
+                )
+            }
+        } else if (chat_template === null) {
+            // These are the cases when the model has a single template
+            // priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
+            chat_template = this.chat_template ?? this.default_chat_template;
+        }

         // Compilation function uses a cache to avoid recompiling the same template
         let compiledTemplate = this._compiled_template_cache.get(chat_template);
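Note: a sketch of the `tokenizer_config.json` shape that [PATCH 04/17] folds into a name-to-template lookup, and of selecting a template by name together with the `tokenizer_kwargs` option from [PATCH 03/17]. Illustrative only; the template strings and the `padding` value are placeholder assumptions:

    // tokenizer_config.json (excerpt, hypothetical values):
    // "chat_template": [
    //     { "name": "default", "template": "{% for message in messages %}...{% endfor %}" },
    //     { "name": "rag",     "template": "{% for message in messages %}...{% endfor %}" }
    // ]

    import { AutoTokenizer } from '@xenova/transformers';

    const tokenizer = await AutoTokenizer.from_pretrained('Xenova/c4ai-command-r-v01-tokenizer');

    // `chat_template` may now be the *name* of a stored template instead of a full template string,
    // and `tokenizer_kwargs` is spread into the final tokenization call.
    const input_ids = tokenizer.apply_chat_template(
        [{ role: 'user', content: 'Whats the biggest penguin in the world?' }],
        {
            chat_template: 'rag',                // name lookup into the reconstructed dict
            tokenize: true,
            add_generation_prompt: true,
            documents: [{ title: 'Tall penguins', text: 'Emperor penguins are the tallest.' }],
            citation_mode: 'accurate',
            tokenizer_kwargs: { padding: true }, // PATCH 03: spread into the tokenization call
        }
    );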
From 6f5bc6d3d842c59d6e87b751e5b9ba189c6a41e2 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Fri, 15 Mar 2024 04:51:58 +0200
Subject: [PATCH 05/17] Add `CohereTokenizer`

---
 src/tokenizers.js | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index 12f9ca913..f2f905a31 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -4300,6 +4300,9 @@ export class VitsTokenizer extends PreTrainedTokenizer {
         this.decoder = new VitsDecoder({});
     }
 }
+
+export class CohereTokenizer extends PreTrainedTokenizer { }
+
 /**
  * Helper class which is used to instantiate pretrained tokenizers with the `from_pretrained` function.
  * The chosen tokenizer class is determined by the type specified in the tokenizer config.
@@ -4351,6 +4354,7 @@ export class AutoTokenizer {
         VitsTokenizer,
         Qwen2Tokenizer,
         GemmaTokenizer,
+        CohereTokenizer,

         // Base case:
         PreTrainedTokenizer,

From d366121a5efb347b4ea3ac76e5b3ba33c9e37b03 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Fri, 15 Mar 2024 17:04:46 +0200
Subject: [PATCH 06/17] `apply_chat_template` is no longer async

---
 tests/tokenizers.test.js | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tokenizers.test.js b/tests/tokenizers.test.js
index 40fed05d1..ee00ce663 100644
--- a/tests/tokenizers.test.js
+++ b/tests/tokenizers.test.js
@@ -395,7 +395,7 @@ describe('Chat templates', () => {
             .replaceAll('USE_DEFAULT_PROMPT', true)
             .replaceAll('DEFAULT_SYSTEM_MESSAGE', 'You are a helpful, respectful and honest assistant.');

-        const text = await tokenizer.apply_chat_template(chat, { tokenize: false, return_tensor: false, chat_template });
+        const text = tokenizer.apply_chat_template(chat, { tokenize: false, return_tensor: false, chat_template });

        expect(text).toEqual("<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant.\n<</SYS>>\n\nHello, how are you? [/INST] I'm doing great. How can I help you today? </s><s>[INST] I'd like to show off how chat templating works! [/INST]");
@@ -412,7 +412,7 @@

         for (let { messages, add_generation_prompt, tokenize, target } of tests) {

-            const generated = await tokenizer.apply_chat_template(messages, {
+            const generated = tokenizer.apply_chat_template(messages, {
                 tokenize,
                 add_generation_prompt,
                 return_tensor: false,
From ae794200cc4fff7ba6fd434835ce4ae0098f3a03 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Fri, 15 Mar 2024 17:18:53 +0200
Subject: [PATCH 07/17] Add unit test for multiple chat templates

---
 tests/tokenizers.test.js | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/tokenizers.test.js b/tests/tokenizers.test.js
index ee00ce663..8b92c6702 100644
--- a/tests/tokenizers.test.js
+++ b/tests/tokenizers.test.js
@@ -350,6 +350,42 @@ describe('Chat templates', () => {
         compare(input_ids, [1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793])
     });

+    it('should support multiple chat templates', async () => {
+
+        const tokenizer = await AutoTokenizer.from_pretrained("Xenova/c4ai-command-r-v01-tokenizer")
+
+        // define conversation input:
+        const conversation = [
+            { role: "user", content: "Whats the biggest penguin in the world?" }
+        ]
+        // define documents to ground on:
+        const documents = [
+            { title: "Tall penguins", text: "Emperor penguins are the tallest growing up to 122 cm in height." },
+            { title: "Penguin habitats", text: "Emperor penguins only live in Antarctica." }
+        ]
+
+        // render the RAG prompt as a string:
+        const grounded_generation_prompt = tokenizer.apply_chat_template(
+            conversation,
+            {
+                chat_template: "rag",
+                tokenize: false,
+                add_generation_prompt: true,
+
+                documents,
+                citation_mode: "accurate", // or "fast"
+            }
+        )
+        expect(grounded_generation_prompt).toEqual(
+            "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble\nThe instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n\n" +
+            "# System Preamble\n## Basic Rules\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n" +
+            "# User Preamble\n## Task and Context\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n## Style Guide\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.<|END_OF_TURN_TOKEN|>" +
+            "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|>" +
+            "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>\nDocument: 0\ntitle: Tall penguins\ntext: Emperor penguins are the tallest growing up to 122 cm in height.\n\nDocument: 1\ntitle: Penguin habitats\ntext: Emperor penguins only live in Antarctica.\n<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Carefully perform the following instructions, in order, starting each with a new line.\nFirstly, Decide which of the retrieved documents are relevant to the user's last input by writing 'Relevant Documents:' followed by comma-separated list of document numbers. If none are relevant, you should instead write 'None'.\nSecondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user's last input by writing 'Cited Documents:' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write 'None'.\nThirdly, Write 'Answer:' followed by a response to the user's last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.\nFinally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols <co: doc> and </co: doc> to indicate when a fact comes from a document in the search result, e.g <co: 0>my fact</co: 0> for a fact from document 0.<|END_OF_TURN_TOKEN|>" +
+            "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+        );
+    });
+
     it('should support user-defined chat template', async () => {

         const tokenizer = await AutoTokenizer.from_pretrained("Xenova/llama-tokenizer");
From 1ec55b3b035051fdaf302530fb24261a673d6ba7 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Sat, 16 Mar 2024 14:56:26 +0200
Subject: [PATCH 08/17] Update tokenizers.js

---
 src/tokenizers.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index f2f905a31..edeebf021 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -2522,7 +2522,7 @@ export class PreTrainedTokenizer extends Callable {
         if (Array.isArray(this.chat_template)) {
             // Chat templates are stored as lists of dicts with fixed key names,
             // we reconstruct that into a single dict while loading them.
-            const chat_template = {};
+            const chat_template = Object.create(null);
             for (const item of this.chat_template) {
                 chat_template[item['name']] = item['template'];
             }

From 926fbf56fef992d3a48d70c6015a9feb250e3495 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Sat, 16 Mar 2024 14:59:33 +0200
Subject: [PATCH 09/17] Also update when `chat_template` is undefined

---
 src/tokenizers.js | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index edeebf021..2a456e4ff 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -3038,10 +3038,10 @@ export class PreTrainedTokenizer extends Callable {
                     `template names are ${Object.keys(template_dict).sort()}.`
                 )
             }
-        } else if (chat_template === null) {
+        } else {
             // These are the cases when the model has a single template
             // priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
-            chat_template = this.chat_template ?? this.default_chat_template;
+            chat_template ??= this.chat_template ?? this.default_chat_template;
         }
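Note: [PATCH 08/17] and [PATCH 09/17] are defensive refinements. Building the lookup with Object.create(null) keeps inherited Object.prototype keys from being mistaken for template names (the lookup at this point still uses the `in` operator), and `??=` also covers the case where the `chat_template` argument is `undefined` rather than `null`. A small standalone sketch of the prototype issue, with placeholder template names:

    const plain = { rag: '...' };
    console.log('constructor' in plain);         // true — inherited from Object.prototype

    const template_dict = Object.create(null);
    template_dict['rag'] = '...';
    console.log('constructor' in template_dict); // false — no prototype to fall back to
    console.log('rag' in template_dict);         // true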
From 761dba9e11f8dd52d5fe41e1ae92c7ed926d0d2a Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Sat, 16 Mar 2024 16:22:11 +0200
Subject: [PATCH 10/17] Support setting tokenizer and text from URL

---
 examples/tokenizer-playground/src/App.jsx | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/examples/tokenizer-playground/src/App.jsx b/examples/tokenizer-playground/src/App.jsx
index 9105e0f20..0a727c4b4 100644
--- a/examples/tokenizer-playground/src/App.jsx
+++ b/examples/tokenizer-playground/src/App.jsx
@@ -4,12 +4,16 @@ import { Token } from './components/Token'

 function App() {

+  // Allow user to set tokenizer and text via URL query parameters
+  const urlParams = new URLSearchParams(window.location.search);
+  const tokenizerParam = urlParams.get('tokenizer');
+  const textParam = urlParams.get('text');
   const [tokenIds, setTokenIds] = useState([])
   const [decodedTokens, setDecodedTokens] = useState([])
   const [margins, setMargins] = useState([])
   const [outputOption, setOutputOption] = useState('text');
-  const [tokenizer, setTokenizer] = useState('Xenova/gpt-4');
+  const [tokenizer, setTokenizer] = useState(tokenizerParam ?? 'Xenova/gpt-4');
   const textareaRef = useRef(null);
   const outputRef = useRef(null);
@@ -51,6 +55,12 @@ function App() {
     worker.current.postMessage({ model_id, text });
   }, [tokenizer]);

+  useEffect(() => {
+    if (textParam) {
+      onInputChange({ target: { value: textParam } });
+    }
+  }, [onInputChange, textParam]);
+
   const onTokenizerChange = useCallback((e) => {
     const model_id = e.target.value;
     setTokenizer(model_id);
@@ -86,6 +96,7 @@ function App() {
             rows="8"
             className="font-mono text-lg block w-full p-2.5 text-gray-900 bg-gray-50 rounded-lg border border-gray-200"
             placeholder="Enter some text"
+            defaultValue={textParam ?? textareaRef.current?.value ?? ''}
           ></textarea>
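Note: with [PATCH 10/17], the playground reads its initial tokenizer and input text from the query string. An illustrative sketch of that behaviour (the host is an assumption; any deployment of the example behaves the same way):

    // e.g. https://<playground-host>/?tokenizer=Xenova/gpt-4&text=Hello%20world
    const params = new URLSearchParams('?tokenizer=Xenova/gpt-4&text=Hello%20world');
    console.log(params.get('tokenizer')); // 'Xenova/gpt-4'
    console.log(params.get('text'));      // 'Hello world'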
From c1b3b858157738155d4b2b0d4b6c3c77b1d61024 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Sat, 16 Mar 2024 16:22:24 +0200
Subject: [PATCH 11/17] Update Claude tokenizer display name

---
 examples/tokenizer-playground/src/App.jsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tokenizer-playground/src/App.jsx b/examples/tokenizer-playground/src/App.jsx
index 0a727c4b4..adc532d2e 100644
--- a/examples/tokenizer-playground/src/App.jsx
+++ b/examples/tokenizer-playground/src/App.jsx
@@ -80,7 +80,7 @@ function App() {
-
+

From 265769a76922cef73c5816457db335c23302e174 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Sat, 16 Mar 2024 16:23:53 +0200
Subject: [PATCH 12/17] Add Cohere Command-R tokenizer to playground

---
 examples/tokenizer-playground/src/App.jsx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/tokenizer-playground/src/App.jsx b/examples/tokenizer-playground/src/App.jsx
index adc532d2e..7097231f3 100644
--- a/examples/tokenizer-playground/src/App.jsx
+++ b/examples/tokenizer-playground/src/App.jsx
@@ -84,6 +84,7 @@ function App() {
+

From f54ea64f4096933a7d92fd5f62f981f4acd787b7 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Mon, 18 Mar 2024 02:57:25 +0200
Subject: [PATCH 13/17] Add `Grok1Tokenizer`

---
 examples/tokenizer-playground/src/App.jsx   | 1 +
 examples/tokenizer-playground/src/worker.js | 1 +
 src/tokenizers.js                           | 3 +++
 3 files changed, 5 insertions(+)

diff --git a/examples/tokenizer-playground/src/App.jsx b/examples/tokenizer-playground/src/App.jsx
index 7097231f3..98173f8fb 100644
--- a/examples/tokenizer-playground/src/App.jsx
+++ b/examples/tokenizer-playground/src/App.jsx
@@ -80,6 +80,7 @@ function App() {
+
diff --git a/examples/tokenizer-playground/src/worker.js b/examples/tokenizer-playground/src/worker.js
index e3739e572..4db09bdc0 100644
--- a/examples/tokenizer-playground/src/worker.js
+++ b/examples/tokenizer-playground/src/worker.js
@@ -22,6 +22,7 @@ self.addEventListener('message', async (event) => {
     // NOTE: We just remove the StripDecoder from the llama tokenizer
     switch (tokenizer.constructor.name) {
         case 'LlamaTokenizer':
+        case 'Grok1Tokenizer':
             // tokenizer.decoder.decoders.at(-1).constructor.name === 'StripDecoder'
             tokenizer.decoder.decoders.pop();
             break;
diff --git a/src/tokenizers.js b/src/tokenizers.js
index 2a456e4ff..3194d974d 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -3245,6 +3245,8 @@ export class GemmaTokenizer extends PreTrainedTokenizer {
     _default_chat_template = "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
 }

+export class Grok1Tokenizer extends PreTrainedTokenizer { }
+
 /**
  * Helper function to build translation inputs for an `NllbTokenizer` or `M2M100Tokenizer`.
  * @param {PreTrainedTokenizer} self The tokenizer instance.
@@ -4354,6 +4356,7 @@ export class AutoTokenizer {
         VitsTokenizer,
         Qwen2Tokenizer,
         GemmaTokenizer,
+        Grok1Tokenizer,
         CohereTokenizer,

         // Base case:
         PreTrainedTokenizer,

From a65ecfaa6c32c7a01d8c48496363d8b1b754d7d5 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 20 Mar 2024 14:48:26 +0200
Subject: [PATCH 14/17] Throw error if chat template object is malformed

---
 src/tokenizers.js | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index 3194d974d..a7cc92e60 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -2523,8 +2523,11 @@ export class PreTrainedTokenizer extends Callable {
             // Chat templates are stored as lists of dicts with fixed key names,
             // we reconstruct that into a single dict while loading them.
             const chat_template = Object.create(null);
-            for (const item of this.chat_template) {
-                chat_template[item['name']] = item['template'];
+            for (const { name, template } of this.chat_template) {
+                if (typeof name !== 'string' || typeof template !== 'string') {
+                    throw new Error('Chat template must be a list of objects with "name" and "template" properties');
+                }
+                chat_template[name] = template;
             }
             this.chat_template = chat_template;
         }

From 20ff0fa3c09549cd205b00879f87497b334b326a Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 20 Mar 2024 14:54:24 +0200
Subject: [PATCH 15/17] Improved error checking

---
 src/tokenizers.js | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index a7cc92e60..755a0c80a 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -3028,12 +3028,15 @@ export class PreTrainedTokenizer extends Callable {
             (this.chat_template === null && this.default_chat_template && typeof this.default_chat_template === 'object')
         ) {
             const template_dict = this.chat_template ?? this.default_chat_template;
+            if (typeof template_dict !== 'object') {
+                throw Error(`Expected chat_template to be an object, but got ${typeof template_dict}`);
+            }

-            if (chat_template !== null && chat_template in template_dict) {
+            if (chat_template !== null && template_dict.hasOwnProperty(chat_template)) {
                 // The user can pass the name of a template to the chat template argument instead of an entire template
                 chat_template = template_dict[chat_template];
-            } else if (chat_template === null && "default" in template_dict) {
-                chat_template = template_dict["default"];
+            } else if (chat_template === null && 'default' in template_dict) {
+                chat_template = template_dict['default'];
             } else if (chat_template === null) {
                 throw Error(
                     `This model has multiple chat templates with no default specified! Please either pass a chat ` +
@@ -3046,6 +3049,9 @@ export class PreTrainedTokenizer extends Callable {
             // priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
             chat_template ??= this.chat_template ?? this.default_chat_template;
         }
+        if (typeof chat_template !== 'string') {
+            throw Error(`chat_template must be a string, but got ${typeof chat_template}`);
+        }

         // Compilation function uses a cache to avoid recompiling the same template
         let compiledTemplate = this._compiled_template_cache.get(chat_template);
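Note: after [PATCH 14/17] and [PATCH 15/17], malformed configurations fail loudly instead of silently producing a broken lookup. A standalone sketch of the validation added in PATCH 14, using a deliberately malformed placeholder value:

    // Mirrors the check performed while folding the config list into a lookup dict.
    const config_value = [{ name: 'default', template: 42 }]; // malformed: template is not a string
    for (const { name, template } of config_value) {
        if (typeof name !== 'string' || typeof template !== 'string') {
            throw new Error('Chat template must be a list of objects with "name" and "template" properties');
        }
    }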
From a6aa7264761003f5ca058684cae4381a907bc1e1 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 20 Mar 2024 14:57:36 +0200
Subject: [PATCH 16/17] Remove redundant error check

---
 src/tokenizers.js | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index 755a0c80a..dc0351bc4 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -3027,10 +3027,7 @@ export class PreTrainedTokenizer extends Callable {
             (this.chat_template && typeof this.chat_template === 'object') ||
             (this.chat_template === null && this.default_chat_template && typeof this.default_chat_template === 'object')
         ) {
-            const template_dict = this.chat_template ?? this.default_chat_template;
-            if (typeof template_dict !== 'object') {
-                throw Error(`Expected chat_template to be an object, but got ${typeof template_dict}`);
-            }
+            const template_dict = this.chat_template ?? this.default_chat_template; // Guaranteed to be a non-null object

             if (chat_template !== null && template_dict.hasOwnProperty(chat_template)) {
                 // The user can pass the name of a template to the chat template argument instead of an entire template

From d26f98612f897b572b3bf4f799accaddccbc3e52 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 20 Mar 2024 15:00:52 +0200
Subject: [PATCH 17/17] `template_dict` can be a null-prototype object

---
 src/tokenizers.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index dc0351bc4..5b58e37c0 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -3029,7 +3029,7 @@ export class PreTrainedTokenizer extends Callable {
         ) {
             const template_dict = this.chat_template ?? this.default_chat_template; // Guaranteed to be a non-null object

-            if (chat_template !== null && template_dict.hasOwnProperty(chat_template)) {
+            if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
                 // The user can pass the name of a template to the chat template argument instead of an entire template
                 chat_template = template_dict[chat_template];
             } else if (chat_template === null && 'default' in template_dict) {
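Note: [PATCH 17/17] is needed precisely because the lookup dict is built with Object.create(null) ([PATCH 08/17]): a null-prototype object has no hasOwnProperty method, so the call introduced in [PATCH 15/17] would itself throw; the static Object.hasOwn works regardless of the prototype. A quick standalone sketch with a placeholder template name:

    const template_dict = Object.create(null);
    template_dict['rag'] = '{% for message in messages %}...{% endfor %}';

    // template_dict.hasOwnProperty('rag');           // TypeError: template_dict.hasOwnProperty is not a function
    console.log(Object.hasOwn(template_dict, 'rag')); // true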