From b33281a6f436811aabb58b3d1c4837777302a2e9 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Mon, 10 Nov 2025 16:35:23 -0500 Subject: [PATCH 1/3] Default add_prefix_space=true for metaspace pretokenizer --- src/tokenizers.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index d98502260..0f3d8b42e 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -2287,7 +2287,7 @@ class MetaspacePreTokenizer extends PreTokenizer { constructor(config) { super(); - this.addPrefixSpace = config.add_prefix_space; + this.addPrefixSpace = config.add_prefix_space ?? true; this.replacement = config.replacement; this.strRep = config.str_rep || this.replacement; this.prepend_scheme = config.prepend_scheme ?? 'always'; From 79a83d1122696bcb6aaeaed7af114b180fa2a957 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Mon, 10 Nov 2025 17:01:05 -0500 Subject: [PATCH 2/3] Remove metaspace add_prefix_space logic (pre_tokenizer + decoder) --- src/tokenizers.js | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index 0f3d8b42e..33cdb3e65 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -2279,7 +2279,6 @@ class VitsDecoder extends Decoder { class MetaspacePreTokenizer extends PreTokenizer { /** * @param {Object} config The configuration object for the MetaspacePreTokenizer. - * @param {boolean} config.add_prefix_space Whether to add a prefix space to the first token. * @param {string} config.replacement The character to replace spaces with. * @param {string} [config.str_rep=config.replacement] An optional string representation of the replacement character. * @param {'first'|'never'|'always'} [config.prepend_scheme='always'] The metaspace prepending scheme. @@ -2287,7 +2286,6 @@ class MetaspacePreTokenizer extends PreTokenizer { constructor(config) { super(); - this.addPrefixSpace = config.add_prefix_space ?? true; this.replacement = config.replacement; this.strRep = config.str_rep || this.replacement; this.prepend_scheme = config.prepend_scheme ?? 'always'; @@ -2309,9 +2307,8 @@ class MetaspacePreTokenizer extends PreTokenizer { if ( // We add a prefix space if: - // (1) The addPrefixSpace option is enabled and the normalized - // token does not already start with the replacement character. - (this.addPrefixSpace && !normalized.startsWith(this.replacement)) + // (1) The normalized token does not already start with the replacement character. + !normalized.startsWith(this.replacement) // and (2) either: // (a) prepend_scheme is 'always' @@ -2335,13 +2332,11 @@ class MetaspaceDecoder extends Decoder { /** * Constructs a new MetaspaceDecoder object. * @param {Object} config The configuration object for the MetaspaceDecoder. - * @param {boolean} config.add_prefix_space Whether to add a prefix space to the decoded string. * @param {string} config.replacement The string to replace spaces with. */ constructor(config) { super(config); - this.addPrefixSpace = config.add_prefix_space; this.replacement = config.replacement; } @@ -2350,7 +2345,7 @@ class MetaspaceDecoder extends Decoder { const result = []; for (let i = 0; i < tokens.length; ++i) { let normalized = tokens[i].replaceAll(this.replacement, ' '); - if (this.addPrefixSpace && i == 0 && normalized.startsWith(' ')) { + if (i == 0 && normalized.startsWith(' ')) { normalized = normalized.substring(1); } result.push(normalized); @@ -3425,7 +3420,6 @@ export class LlamaTokenizer extends PreTrainedTokenizer { this.normalizer = null; this.pre_tokenizer = new MetaspacePreTokenizer({ replacement: SPIECE_UNDERLINE, - add_prefix_space: true, prepend_scheme: "first", }); } From e6e5f501dbba39781cda0ce62895fddd8f823aa6 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Mon, 10 Nov 2025 17:12:05 -0500 Subject: [PATCH 3/3] Fix bad test! 14 is correct (tested with python library) --- tests/models/jina_clip/test_processor_jina_clip.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/jina_clip/test_processor_jina_clip.js b/tests/models/jina_clip/test_processor_jina_clip.js index 47ac16fe8..07b11c9b8 100644 --- a/tests/models/jina_clip/test_processor_jina_clip.js +++ b/tests/models/jina_clip/test_processor_jina_clip.js @@ -33,8 +33,8 @@ export default () => { // Encode text and images const { input_ids, attention_mask, pixel_values } = await processor(sentences, images, { padding: true, truncation: true }); - expect(input_ids.dims).toEqual([sentences.length, 19]); - expect(attention_mask.dims).toEqual([sentences.length, 19]); + expect(input_ids.dims).toEqual([sentences.length, 14]); + expect(attention_mask.dims).toEqual([sentences.length, 14]); expect(pixel_values.dims).toEqual([images.length, 3, 512, 512]); expect(pixel_values.mean().item()).toBeCloseTo(0.7857685685157776, 6); },