From c4c397e1710e8615bde0a7082f38ed18d8a816ed Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 21 Feb 2024 14:29:08 +0200 Subject: [PATCH 1/7] Fix styling for whitespace tokens --- examples/tokenizer-playground/src/components/Token.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tokenizer-playground/src/components/Token.jsx b/examples/tokenizer-playground/src/components/Token.jsx index b49fa7a07..579b000f9 100644 --- a/examples/tokenizer-playground/src/components/Token.jsx +++ b/examples/tokenizer-playground/src/components/Token.jsx @@ -14,7 +14,7 @@ export function Token({ text, position, margin }) { + className={`leading-5 ${COLOURS[position % COLOURS.length]}`}> {text} ) :
) From 206f74d4fb207d1408bcf9bcd14efd80dbec25ec Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 21 Feb 2024 15:05:27 +0200 Subject: [PATCH 2/7] Add `GemmaTokenizer` --- src/tokenizers.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/tokenizers.js b/src/tokenizers.js index 563e39319..9692cf3b0 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -3204,6 +3204,10 @@ export class EsmTokenizer extends PreTrainedTokenizer { } export class Qwen2Tokenizer extends PreTrainedTokenizer { } +export class GemmaTokenizer extends PreTrainedTokenizer { + _default_chat_template = "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}" +} + /** * Helper function to build translation inputs for an `NllbTokenizer` or `M2M100Tokenizer`. * @param {PreTrainedTokenizer} self The tokenizer instance. @@ -4309,6 +4313,7 @@ export class AutoTokenizer { NougatTokenizer, VitsTokenizer, Qwen2Tokenizer, + GemmaTokenizer, // Base case: PreTrainedTokenizer, From 30c7dc60e5c653999f67901b62fdf0847ea4ae25 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 21 Feb 2024 15:06:05 +0200 Subject: [PATCH 3/7] Update minimum `@huggingface/jinja` version --- package-lock.json | 8 ++++---- package.json | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/package-lock.json b/package-lock.json index c953c7232..49fb65a51 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "2.15.0", "license": "Apache-2.0", "dependencies": { - "@huggingface/jinja": "^0.1.0", + "@huggingface/jinja": "^0.1.3", "onnxruntime-web": "1.14.0", "sharp": "^0.32.0" }, @@ -745,9 +745,9 @@ } }, "node_modules/@huggingface/jinja": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.1.0.tgz", - "integrity": "sha512-NgZ0imvGPHblw+nFJN2eC+so0DmvLSEieldI7gjZZbBUDE80ypG1O+DibdeWne1vQuGBYV/pC3XL//SgxiXC7g==", + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.1.3.tgz", + "integrity": "sha512-9KsiorsdIK8+7VmlamAT7Uh90zxAhC/SeKaKc80v58JhtPYuwaJpmR/ST7XAUxrHAFqHTCoTH5aJnJDwSL6xIQ==", "engines": { "node": ">=18" } diff --git a/package.json b/package.json index ff14f5c97..6fe63e41f 100644 --- a/package.json +++ b/package.json @@ -40,7 +40,7 @@ "dependencies": { "onnxruntime-web": "1.14.0", "sharp": "^0.32.0", - "@huggingface/jinja": "^0.1.0" + "@huggingface/jinja": "^0.1.3" }, "optionalDependencies": { "onnxruntime-node": "1.14.0" From 2449f1c2aef0189f19abc4c6377e0e540e0df770 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 21 Feb 2024 15:06:24 +0200 Subject: [PATCH 4/7] Add Gemma to tokenizer playground --- examples/tokenizer-playground/src/App.jsx | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tokenizer-playground/src/App.jsx b/examples/tokenizer-playground/src/App.jsx index fd6deb077..d15bd159a 100644 --- a/examples/tokenizer-playground/src/App.jsx +++ b/examples/tokenizer-playground/src/App.jsx @@ -70,6 +70,7 @@ function App() { + From fc16dc72980f90e30bcb46a4cae9c3c75f206ef0 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 21 Feb 2024 15:34:14 +0200 Subject: [PATCH 5/7] Add Gemma tokenizer unit test --- tests/generate_tests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/generate_tests.py b/tests/generate_tests.py index 81836b901..40725f812 100644 --- a/tests/generate_tests.py +++ b/tests/generate_tests.py @@ -36,6 +36,9 @@ # Uses a pretokenizer regex which is not compatible with JavaScript. 'Qwen/Qwen1.5-0.5B-Chat', ], + 'gemma': [ + 'hf-internal-testing/dummy-gemma', + ], } MODELS_TO_IGNORE = [ From 3f5dc16a43a1c31db67753a1d78f827345579f2f Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 21 Feb 2024 15:59:05 +0200 Subject: [PATCH 6/7] Update tokenizer names in playground --- examples/tokenizer-playground/src/App.jsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tokenizer-playground/src/App.jsx b/examples/tokenizer-playground/src/App.jsx index d15bd159a..c6546d20f 100644 --- a/examples/tokenizer-playground/src/App.jsx +++ b/examples/tokenizer-playground/src/App.jsx @@ -70,8 +70,8 @@ function App() { - - + + From 30193007c8f921308f74ad1fc8e6e6b9407fb89f Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 21 Feb 2024 15:59:31 +0200 Subject: [PATCH 7/7] Update Gemma tokenizer test --- tests/generate_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generate_tests.py b/tests/generate_tests.py index 40725f812..c449d34be 100644 --- a/tests/generate_tests.py +++ b/tests/generate_tests.py @@ -37,7 +37,7 @@ 'Qwen/Qwen1.5-0.5B-Chat', ], 'gemma': [ - 'hf-internal-testing/dummy-gemma', + 'Xenova/gemma-tokenizer', ], }