diff --git a/src/tokenizers.js b/src/tokenizers.js index d98502260..33cdb3e65 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -2279,7 +2279,6 @@ class VitsDecoder extends Decoder { class MetaspacePreTokenizer extends PreTokenizer { /** * @param {Object} config The configuration object for the MetaspacePreTokenizer. - * @param {boolean} config.add_prefix_space Whether to add a prefix space to the first token. * @param {string} config.replacement The character to replace spaces with. * @param {string} [config.str_rep=config.replacement] An optional string representation of the replacement character. * @param {'first'|'never'|'always'} [config.prepend_scheme='always'] The metaspace prepending scheme. @@ -2287,7 +2286,6 @@ class MetaspacePreTokenizer extends PreTokenizer { constructor(config) { super(); - this.addPrefixSpace = config.add_prefix_space; this.replacement = config.replacement; this.strRep = config.str_rep || this.replacement; this.prepend_scheme = config.prepend_scheme ?? 'always'; @@ -2309,9 +2307,8 @@ class MetaspacePreTokenizer extends PreTokenizer { if ( // We add a prefix space if: - // (1) The addPrefixSpace option is enabled and the normalized - // token does not already start with the replacement character. - (this.addPrefixSpace && !normalized.startsWith(this.replacement)) + // (1) The normalized token does not already start with the replacement character. + !normalized.startsWith(this.replacement) // and (2) either: // (a) prepend_scheme is 'always' @@ -2335,13 +2332,11 @@ class MetaspaceDecoder extends Decoder { /** * Constructs a new MetaspaceDecoder object. * @param {Object} config The configuration object for the MetaspaceDecoder. - * @param {boolean} config.add_prefix_space Whether to add a prefix space to the decoded string. * @param {string} config.replacement The string to replace spaces with. */ constructor(config) { super(config); - this.addPrefixSpace = config.add_prefix_space; this.replacement = config.replacement; } @@ -2350,7 +2345,7 @@ class MetaspaceDecoder extends Decoder { const result = []; for (let i = 0; i < tokens.length; ++i) { let normalized = tokens[i].replaceAll(this.replacement, ' '); - if (this.addPrefixSpace && i == 0 && normalized.startsWith(' ')) { + if (i == 0 && normalized.startsWith(' ')) { normalized = normalized.substring(1); } result.push(normalized); @@ -3425,7 +3420,6 @@ export class LlamaTokenizer extends PreTrainedTokenizer { this.normalizer = null; this.pre_tokenizer = new MetaspacePreTokenizer({ replacement: SPIECE_UNDERLINE, - add_prefix_space: true, prepend_scheme: "first", }); } diff --git a/tests/models/jina_clip/test_processor_jina_clip.js b/tests/models/jina_clip/test_processor_jina_clip.js index 47ac16fe8..07b11c9b8 100644 --- a/tests/models/jina_clip/test_processor_jina_clip.js +++ b/tests/models/jina_clip/test_processor_jina_clip.js @@ -33,8 +33,8 @@ export default () => { // Encode text and images const { input_ids, attention_mask, pixel_values } = await processor(sentences, images, { padding: true, truncation: true }); - expect(input_ids.dims).toEqual([sentences.length, 19]); - expect(attention_mask.dims).toEqual([sentences.length, 19]); + expect(input_ids.dims).toEqual([sentences.length, 14]); + expect(attention_mask.dims).toEqual([sentences.length, 14]); expect(pixel_values.dims).toEqual([images.length, 3, 512, 512]); expect(pixel_values.mean().item()).toBeCloseTo(0.7857685685157776, 6); },