From 373d901974332d42099f266e8a048006d7b4cd3a Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Thu, 16 Oct 2025 08:30:27 -0700 Subject: [PATCH 1/2] Add support for NanoChat https://github.com/huggingface/transformers/pull/41634 --- src/configs.js | 1 + src/models.js | 8 ++++++++ src/tokenizers.js | 7 ++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/configs.js b/src/configs.js index 47e553927..e5c660a73 100644 --- a/src/configs.js +++ b/src/configs.js @@ -112,6 +112,7 @@ function getNormalizedConfig(config) { break; case 'llama': case 'llama4_text': + case 'nanochat': case 'arcee': case 'lfm2': case 'smollm3': diff --git a/src/models.js b/src/models.js index 075f3a088..e5bac4328 100644 --- a/src/models.js +++ b/src/models.js @@ -4585,6 +4585,12 @@ export class Llama4PreTrainedModel extends PreTrainedModel { } export class Llama4ForCausalLM extends Llama4PreTrainedModel { } ////////////////////////////////////////////////// +////////////////////////////////////////////////// +// NanoChat models +export class NanoChatPreTrainedModel extends PreTrainedModel { } +export class NanoChatModel extends NanoChatPreTrainedModel { } +export class NanoChatForCausalLM extends NanoChatPreTrainedModel { } +////////////////////////////////////////////////// ////////////////////////////////////////////////// // Arcee models @@ -7845,6 +7851,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([ ['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]], ['codegen', ['CodeGenModel', CodeGenModel]], ['llama', ['LlamaModel', LlamaModel]], + ['nanochat', ['NanoChatModel', NanoChatModel]], ['arcee', ['ArceeModel', ArceeModel]], ['lfm2', ['Lfm2Model', Lfm2Model]], ['smollm3', ['SmolLM3Model', SmolLM3Model]], @@ -7955,6 +7962,7 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([ ['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]], ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]], ['llama', ['LlamaForCausalLM', LlamaForCausalLM]], + ['nanochat', ['NanoChatForCausalLM', NanoChatForCausalLM]], ['llama4_text', ['Llama4ForCausalLM', Llama4ForCausalLM]], ['arcee', ['ArceeForCausalLM', ArceeForCausalLM]], ['lfm2', ['Lfm2ForCausalLM', Lfm2ForCausalLM]], diff --git a/src/tokenizers.js b/src/tokenizers.js index 2e617f93b..d98502260 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -278,9 +278,14 @@ const BLOOM_SPLIT_CHARS = '.,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c'; // A mapping of regex patterns to their equivalent (but possibly longer) JS-compatible versions. const PROBLEMATIC_REGEX_MAP = new Map([ - // This uses the case insensitive group modifier, which is not supported in JavaScript. + // These use the case insensitive group modifier, which is not supported in JavaScript. // When parsing the regex, an "Invalid group" error is thrown. ["(?i:'s|'t|'re|'ve|'m|'ll|'d)", "(?:'([sS]|[tT]|[rR][eE]|[vV][eE]|[mM]|[lL][lL]|[dD]))"], + ["(?i:[sdmt]|ll|ve|re)", "(?:[sS]|[dD]|[mM]|[tT]|[lL][lL]|[vV][eE]|[rR][eE])"], + + // JS doesn't support possessive quantifiers (these are used in recent OpenAI tokenizers). + ["[^\\r\\n\\p{L}\\p{N}]?+", "[^\\r\\n\\p{L}\\p{N}]?"], + ["[^\\s\\p{L}\\p{N}]++", "[^\\s\\p{L}\\p{N}]+"], // Used to override the default (invalid) regex of the bloom pretokenizer. // For more information, see https://github.com/huggingface/transformers.js/issues/94 From 18da61770053e2bab5118207df09793dd9013eb4 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Thu, 16 Oct 2025 08:33:37 -0700 Subject: [PATCH 2/2] Add nanochat to supported models list --- README.md | 1 + docs/snippets/6_supported-models.snippet | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 7a51c5aaf..bae8aefda 100644 --- a/README.md +++ b/README.md @@ -393,6 +393,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://huggingface.co/papers/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://huggingface.co/papers/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[NanoChat](https://huggingface.co/docs/transformers/model_doc/nanochat)** released with the repository [nanochat: The best ChatGPT that $100 can buy](https://github.com/karpathy/nanochat) by Andrej Karpathy. 1. **NeoBERT** (from Chandar Research Lab) released with the paper [NeoBERT: A Next-Generation BERT](https://huggingface.co/papers/2502.19587) by Lola Le Breton, Quentin Fournier, Mariam El Mezouar, John X. Morris, Sarath Chandar. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://huggingface.co/papers/2207.04672) by the NLLB team. 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://huggingface.co/papers/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet index faca40aaa..1c7cff016 100644 --- a/docs/snippets/6_supported-models.snippet +++ b/docs/snippets/6_supported-models.snippet @@ -107,6 +107,7 @@ 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://huggingface.co/papers/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://huggingface.co/papers/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[NanoChat](https://huggingface.co/docs/transformers/model_doc/nanochat)** released with the repository [nanochat: The best ChatGPT that $100 can buy](https://github.com/karpathy/nanochat) by Andrej Karpathy. 1. **NeoBERT** (from Chandar Research Lab) released with the paper [NeoBERT: A Next-Generation BERT](https://huggingface.co/papers/2502.19587) by Lola Le Breton, Quentin Fournier, Mariam El Mezouar, John X. Morris, Sarath Chandar. 1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://huggingface.co/papers/2207.04672) by the NLLB team. 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://huggingface.co/papers/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.