From 1ca933476216db534935ddbb2746319cdbce9195 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 6 Nov 2025 21:11:53 +0000 Subject: [PATCH 1/6] Initial plan From eb98dac92e3d2edbb0af46d1f3247bbd122ac7f4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 6 Nov 2025 21:21:07 +0000 Subject: [PATCH 2/6] Add token_to_id, id_to_token, and get_added_tokens_decoder methods to Tokenizer class Co-authored-by: xenova <26504141+xenova@users.noreply.github.com> --- src/core/Tokenizer.ts | 30 ++++++ src/index.ts | 1 + tests/tokenizer-methods.test.ts | 166 ++++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+) create mode 100644 tests/tokenizer-methods.test.ts diff --git a/src/core/Tokenizer.ts b/src/core/Tokenizer.ts index 4086af4..fc6ca22 100644 --- a/src/core/Tokenizer.ts +++ b/src/core/Tokenizer.ts @@ -292,6 +292,36 @@ class Tokenizer { ? this.post_processor(tokens1, tokens2, add_special_tokens) : { tokens: merge_arrays(tokens1 ?? [], tokens2 ?? []) }; } + + /** + * Converts a token string to its corresponding token ID. + * @param token The token string to convert. + * @returns The token ID, or undefined if the token is not in the vocabulary. + */ + public token_to_id(token: string): number | undefined { + return this.model.tokens_to_ids.get(token); + } + + /** + * Converts a token ID to its corresponding token string. + * @param id The token ID to convert. + * @returns The token string, or undefined if the ID is not in the vocabulary. + */ + public id_to_token(id: number): string | undefined { + return this.model.vocab[id]; + } + + /** + * Returns a mapping of token IDs to AddedToken objects for all added tokens. + * @returns A Map where keys are token IDs and values are AddedToken objects. 
+ */ + public get_added_tokens_decoder(): Map<number, AddedToken> { + const decoder = new Map(); + for (const token of this.added_tokens) { + decoder.set(token.id, token); + } + return decoder; + } } export default Tokenizer; diff --git a/src/index.ts b/src/index.ts index 81573ea..d1c639d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,5 @@ export { default as Tokenizer } from "./core/Tokenizer"; +export { default as AddedToken } from "./core/AddedToken"; export type { Encoding } from "./static/types"; // Export all components diff --git a/tests/tokenizer-methods.test.ts b/tests/tokenizer-methods.test.ts new file mode 100644 index 0000000..4bf1b8a --- /dev/null +++ b/tests/tokenizer-methods.test.ts @@ -0,0 +1,166 @@ +import { Tokenizer, AddedToken } from "../src"; + +describe("Tokenizer methods - API validation", () => { + describe("Method signatures", () => { + // Create a minimal tokenizer config for testing + const minimalTokenizerJson = { + version: "1.0", + truncation: null as any, + padding: null as any, + added_tokens: [ + { + id: 50256, + content: "<|endoftext|>", + single_word: false, + lstrip: false, + rstrip: false, + normalized: false, + special: true, + }, + ], + normalizer: null as any, + pre_tokenizer: null as any, + post_processor: null as any, + decoder: null as any, + model: { + type: "BPE", + dropout: null as any, + unk_token: null as any, + continuing_subword_prefix: null as any, + end_of_word_suffix: null as any, + fuse_unk: false, + byte_fallback: false, + ignore_merges: false, + vocab: { + hello: 31373, + world: 995, + "<|endoftext|>": 50256, + }, + merges: [] as any[], + }, + }; + + const minimalTokenizerConfig = { + add_bos_token: false, + add_prefix_space: false, + added_tokens_decoder: {}, + bos_token: null as any, + clean_up_tokenization_spaces: true, + eos_token: null as any, + model_max_length: 1000000000000000, + pad_token: null as any, + sp_model_kwargs: {}, + spaces_between_special_tokens: false, + tokenizer_class: "GPT2Tokenizer", + unk_token: null as any, + use_default_system_prompt: false, + }; + + let tokenizer: Tokenizer; + + beforeAll(() => { + tokenizer = new Tokenizer(minimalTokenizerJson, minimalTokenizerConfig); + }); + + describe("token_to_id", () => { + test("should be a function", () => { + expect(typeof tokenizer.token_to_id).toBe("function"); + }); + + test("should return correct ID for existing token", () => { + const id = tokenizer.token_to_id("hello"); + expect(id).toBe(31373); + }); + + test("should return correct ID for special token", () => { + const id = tokenizer.token_to_id("<|endoftext|>"); + expect(id).toBe(50256); + }); + + test("should return undefined for non-existing token", () => { + const id = tokenizer.token_to_id("nonexistenttoken12345"); + expect(id).toBeUndefined(); + }); + + test("should return correct ID for world token", () => { + const id = tokenizer.token_to_id("world"); + expect(id).toBe(995); + }); + }); + + describe("id_to_token", () => { + test("should be a function", () => { + expect(typeof tokenizer.id_to_token).toBe("function"); + }); + + test("should return correct token for existing ID", () => { + const token = tokenizer.id_to_token(31373); + expect(token).toBe("hello"); + }); + + test("should return correct token for special token ID", () => { + const token = tokenizer.id_to_token(50256); + expect(token).toBe("<|endoftext|>"); + }); + + test("should return undefined for non-existing ID", () => { + const token = tokenizer.id_to_token(999999); + expect(token).toBeUndefined(); + }); + + test("should return correct token 
for world ID", () => { + const token = tokenizer.id_to_token(995); + expect(token).toBe("world"); + }); + }); + + describe("get_added_tokens_decoder", () => { + test("should be a function", () => { + expect(typeof tokenizer.get_added_tokens_decoder).toBe("function"); + }); + + test("should return a Map", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + expect(decoder).toBeInstanceOf(Map); + }); + + test("should contain special token", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + expect(decoder.has(50256)).toBe(true); + + const addedToken = decoder.get(50256); + expect(addedToken).toBeDefined(); + expect(addedToken?.content).toBe("<|endoftext|>"); + expect(addedToken?.special).toBe(true); + }); + + test("should return AddedToken objects with correct properties", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + const addedToken = decoder.get(50256); + + expect(addedToken).toHaveProperty("content"); + expect(addedToken).toHaveProperty("id"); + expect(addedToken).toHaveProperty("special"); + expect(addedToken).toHaveProperty("lstrip"); + expect(addedToken).toHaveProperty("rstrip"); + expect(addedToken).toHaveProperty("single_word"); + expect(addedToken).toHaveProperty("normalized"); + }); + + test("should return AddedToken instances", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + const addedToken = decoder.get(50256); + expect(addedToken).toBeInstanceOf(AddedToken); + }); + + test("roundtrip: token_to_id and id_to_token should be inverse operations", () => { + const token = "hello"; + const id = tokenizer.token_to_id(token); + expect(id).toBeDefined(); + + const tokenBack = tokenizer.id_to_token(id!); + expect(tokenBack).toBe(token); + }); + }); + }); +}); From 67f126116390cbdcea5151b1e0f728b57f4653a2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 6 Nov 2025 21:37:51 +0000 Subject: [PATCH 3/6] Simplify unit tests with minimal BPE tokenizer configuration Co-authored-by: xenova <26504141+xenova@users.noreply.github.com> --- tests/tokenizer-methods.test.ts | 309 ++++++++++++++++---------------- 1 file changed, 159 insertions(+), 150 deletions(-) diff --git a/tests/tokenizer-methods.test.ts b/tests/tokenizer-methods.test.ts index 4bf1b8a..c4b49bb 100644 --- a/tests/tokenizer-methods.test.ts +++ b/tests/tokenizer-methods.test.ts @@ -1,166 +1,175 @@ import { Tokenizer, AddedToken } from "../src"; -describe("Tokenizer methods - API validation", () => { - describe("Method signatures", () => { - // Create a minimal tokenizer config for testing - const minimalTokenizerJson = { - version: "1.0", - truncation: null as any, - padding: null as any, - added_tokens: [ - { - id: 50256, - content: "<|endoftext|>", - single_word: false, - lstrip: false, - rstrip: false, - normalized: false, - special: true, - }, - ], - normalizer: null as any, - pre_tokenizer: null as any, - post_processor: null as any, - decoder: null as any, - model: { - type: "BPE", - dropout: null as any, - unk_token: null as any, - continuing_subword_prefix: null as any, - end_of_word_suffix: null as any, - fuse_unk: false, - byte_fallback: false, - ignore_merges: false, - vocab: { - hello: 31373, - world: 995, - "<|endoftext|>": 50256, - }, - merges: [] as any[], +describe("Tokenizer methods", () => { + // Create a simple BPE tokenizer for testing + // Vocab size: 9 tokens + // - 3 special tokens: , , + // - 1 unk token: + // - 5 regular tokens: a, b, c, ab, bc + const tokenizerJson = { + version: 
"1.0", + truncation: null as any, + padding: null as any, + added_tokens: [ + { id: 0, content: "", single_word: false, lstrip: false, rstrip: false, normalized: false, special: true }, + { id: 1, content: "", single_word: false, lstrip: false, rstrip: false, normalized: false, special: true }, + { id: 2, content: "", single_word: false, lstrip: false, rstrip: false, normalized: false, special: true }, + { id: 3, content: "", single_word: false, lstrip: false, rstrip: false, normalized: false, special: true }, + ], + normalizer: null as any, + pre_tokenizer: null as any, + post_processor: null as any, + decoder: null as any, + model: { + type: "BPE", + dropout: null as any, + unk_token: "", + continuing_subword_prefix: null as any, + end_of_word_suffix: null as any, + fuse_unk: false, + byte_fallback: false, + ignore_merges: false, + vocab: { + "": 0, + "": 1, + "": 2, + "": 3, + "a": 4, + "b": 5, + "c": 6, + "ab": 7, + "bc": 8, }, - }; - - const minimalTokenizerConfig = { - add_bos_token: false, - add_prefix_space: false, - added_tokens_decoder: {}, - bos_token: null as any, - clean_up_tokenization_spaces: true, - eos_token: null as any, - model_max_length: 1000000000000000, - pad_token: null as any, - sp_model_kwargs: {}, - spaces_between_special_tokens: false, - tokenizer_class: "GPT2Tokenizer", - unk_token: null as any, - use_default_system_prompt: false, - }; - - let tokenizer: Tokenizer; - - beforeAll(() => { - tokenizer = new Tokenizer(minimalTokenizerJson, minimalTokenizerConfig); + merges: [ + ["a", "b"], + ["b", "c"], + ] as any[], + }, + }; + + const tokenizerConfig = { + add_bos_token: false, + add_prefix_space: false, + added_tokens_decoder: { + "0": { id: 0, content: "", special: true }, + "1": { id: 1, content: "", special: true }, + "2": { id: 2, content: "", special: true }, + "3": { id: 3, content: "", special: true }, + }, + bos_token: "", + clean_up_tokenization_spaces: false, + eos_token: "", + legacy: true, + model_max_length: 1000000000000000, + pad_token: "", + sp_model_kwargs: {}, + spaces_between_special_tokens: false, + tokenizer_class: "LlamaTokenizer", + unk_token: "", + use_default_system_prompt: false, + }; + + let tokenizer: Tokenizer; + + beforeAll(() => { + tokenizer = new Tokenizer(tokenizerJson, tokenizerConfig); + }); + + describe("token_to_id", () => { + test("should return correct ID for regular token", () => { + expect(tokenizer.token_to_id("a")).toBe(4); + expect(tokenizer.token_to_id("b")).toBe(5); + expect(tokenizer.token_to_id("c")).toBe(6); + }); + + test("should return correct ID for merged token", () => { + expect(tokenizer.token_to_id("ab")).toBe(7); + expect(tokenizer.token_to_id("bc")).toBe(8); + }); + + test("should return correct ID for special tokens", () => { + expect(tokenizer.token_to_id("")).toBe(0); + expect(tokenizer.token_to_id("")).toBe(1); + expect(tokenizer.token_to_id("")).toBe(2); + expect(tokenizer.token_to_id("")).toBe(3); + }); + + test("should return undefined for non-existing token", () => { + expect(tokenizer.token_to_id("xyz")).toBeUndefined(); + }); + }); + + describe("id_to_token", () => { + test("should return correct token for regular token ID", () => { + expect(tokenizer.id_to_token(4)).toBe("a"); + expect(tokenizer.id_to_token(5)).toBe("b"); + expect(tokenizer.id_to_token(6)).toBe("c"); + }); + + test("should return correct token for merged token ID", () => { + expect(tokenizer.id_to_token(7)).toBe("ab"); + expect(tokenizer.id_to_token(8)).toBe("bc"); + }); + + test("should return correct token for special token 
ID", () => { + expect(tokenizer.id_to_token(0)).toBe(""); + expect(tokenizer.id_to_token(1)).toBe(""); + expect(tokenizer.id_to_token(2)).toBe(""); + expect(tokenizer.id_to_token(3)).toBe(""); + }); + + test("should return undefined for non-existing ID", () => { + expect(tokenizer.id_to_token(999)).toBeUndefined(); + }); + }); + + describe("get_added_tokens_decoder", () => { + test("should return a Map", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + expect(decoder).toBeInstanceOf(Map); + }); + + test("should contain all special tokens", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + expect(decoder.size).toBe(4); + expect(decoder.has(0)).toBe(true); + expect(decoder.has(1)).toBe(true); + expect(decoder.has(2)).toBe(true); + expect(decoder.has(3)).toBe(true); }); - describe("token_to_id", () => { - test("should be a function", () => { - expect(typeof tokenizer.token_to_id).toBe("function"); - }); - - test("should return correct ID for existing token", () => { - const id = tokenizer.token_to_id("hello"); - expect(id).toBe(31373); - }); - - test("should return correct ID for special token", () => { - const id = tokenizer.token_to_id("<|endoftext|>"); - expect(id).toBe(50256); - }); - - test("should return undefined for non-existing token", () => { - const id = tokenizer.token_to_id("nonexistenttoken12345"); - expect(id).toBeUndefined(); - }); - - test("should return correct ID for world token", () => { - const id = tokenizer.token_to_id("world"); - expect(id).toBe(995); - }); + test("should return AddedToken objects with correct properties", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + + const unkToken = decoder.get(0); + expect(unkToken).toBeDefined(); + expect(unkToken?.content).toBe(""); + expect(unkToken?.special).toBe(true); + expect(unkToken).toBeInstanceOf(AddedToken); + + const bosToken = decoder.get(1); + expect(bosToken?.content).toBe(""); + expect(bosToken?.special).toBe(true); }); - describe("id_to_token", () => { - test("should be a function", () => { - expect(typeof tokenizer.id_to_token).toBe("function"); - }); - - test("should return correct token for existing ID", () => { - const token = tokenizer.id_to_token(31373); - expect(token).toBe("hello"); - }); - - test("should return correct token for special token ID", () => { - const token = tokenizer.id_to_token(50256); - expect(token).toBe("<|endoftext|>"); - }); - - test("should return undefined for non-existing ID", () => { - const token = tokenizer.id_to_token(999999); - expect(token).toBeUndefined(); - }); - - test("should return correct token for world ID", () => { - const token = tokenizer.id_to_token(995); - expect(token).toBe("world"); - }); + test("should not contain regular tokens", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + expect(decoder.has(4)).toBe(false); + expect(decoder.has(5)).toBe(false); + expect(decoder.has(6)).toBe(false); }); + }); - describe("get_added_tokens_decoder", () => { - test("should be a function", () => { - expect(typeof tokenizer.get_added_tokens_decoder).toBe("function"); - }); - - test("should return a Map", () => { - const decoder = tokenizer.get_added_tokens_decoder(); - expect(decoder).toBeInstanceOf(Map); - }); - - test("should contain special token", () => { - const decoder = tokenizer.get_added_tokens_decoder(); - expect(decoder.has(50256)).toBe(true); - - const addedToken = decoder.get(50256); - expect(addedToken).toBeDefined(); - expect(addedToken?.content).toBe("<|endoftext|>"); - 
expect(addedToken?.special).toBe(true); - }); - - test("should return AddedToken objects with correct properties", () => { - const decoder = tokenizer.get_added_tokens_decoder(); - const addedToken = decoder.get(50256); - - expect(addedToken).toHaveProperty("content"); - expect(addedToken).toHaveProperty("id"); - expect(addedToken).toHaveProperty("special"); - expect(addedToken).toHaveProperty("lstrip"); - expect(addedToken).toHaveProperty("rstrip"); - expect(addedToken).toHaveProperty("single_word"); - expect(addedToken).toHaveProperty("normalized"); - }); - - test("should return AddedToken instances", () => { - const decoder = tokenizer.get_added_tokens_decoder(); - const addedToken = decoder.get(50256); - expect(addedToken).toBeInstanceOf(AddedToken); - }); - - test("roundtrip: token_to_id and id_to_token should be inverse operations", () => { - const token = "hello"; + describe("roundtrip conversions", () => { + test("token_to_id and id_to_token should be inverse operations", () => { + const tokens = ["a", "b", "c", "ab", "bc", "", "", "", ""]; + + for (const token of tokens) { const id = tokenizer.token_to_id(token); expect(id).toBeDefined(); - const tokenBack = tokenizer.id_to_token(id!); expect(tokenBack).toBe(token); - }); + } }); }); }); From 3068a0f31750fdeef77e72b80fef59b4fa69c3f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 6 Nov 2025 21:40:24 +0000 Subject: [PATCH 4/6] Refactor test code to reduce duplication and improve maintainability Co-authored-by: xenova <26504141+xenova@users.noreply.github.com> --- tests/tokenizer-methods.test.ts | 39 ++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/tests/tokenizer-methods.test.ts b/tests/tokenizer-methods.test.ts index c4b49bb..7d38eea 100644 --- a/tests/tokenizer-methods.test.ts +++ b/tests/tokenizer-methods.test.ts @@ -6,16 +6,30 @@ describe("Tokenizer methods", () => { // - 3 special tokens: , , // - 1 unk token: // - 5 regular tokens: a, b, c, ab, bc + + // Helper to create added token configuration + const createAddedToken = (id: number, content: string) => ({ + id, + content, + single_word: false, + lstrip: false, + rstrip: false, + normalized: false, + special: true, + }); + + const addedTokens = [ + createAddedToken(0, ""), + createAddedToken(1, ""), + createAddedToken(2, ""), + createAddedToken(3, ""), + ]; + const tokenizerJson = { version: "1.0", truncation: null as any, padding: null as any, - added_tokens: [ - { id: 0, content: "", single_word: false, lstrip: false, rstrip: false, normalized: false, special: true }, - { id: 1, content: "", single_word: false, lstrip: false, rstrip: false, normalized: false, special: true }, - { id: 2, content: "", single_word: false, lstrip: false, rstrip: false, normalized: false, special: true }, - { id: 3, content: "", single_word: false, lstrip: false, rstrip: false, normalized: false, special: true }, - ], + added_tokens: addedTokens, normalizer: null as any, pre_tokenizer: null as any, post_processor: null as any, @@ -50,12 +64,12 @@ describe("Tokenizer methods", () => { const tokenizerConfig = { add_bos_token: false, add_prefix_space: false, - added_tokens_decoder: { - "0": { id: 0, content: "", special: true }, - "1": { id: 1, content: "", special: true }, - "2": { id: 2, content: "", special: true }, - "3": { id: 3, content: "", special: true }, - }, + added_tokens_decoder: Object.fromEntries( + addedTokens.map(token => [ + String(token.id), + { id: token.id, content: 
token.content, special: token.special }, + ]) + ), bos_token: "", clean_up_tokenization_spaces: false, eos_token: "", @@ -140,7 +154,6 @@ describe("Tokenizer methods", () => { test("should return AddedToken objects with correct properties", () => { const decoder = tokenizer.get_added_tokens_decoder(); - const unkToken = decoder.get(0); expect(unkToken).toBeDefined(); expect(unkToken?.content).toBe(""); From 5c1551df969c530ebee018bd4ea97c29540210b5 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Thu, 6 Nov 2025 17:32:29 -0500 Subject: [PATCH 5/6] Update tests --- tests/tokenizer-methods.test.ts | 188 ----------------------------- tests/tokenizers.test.ts | 202 +++++++++++++++++++++++++++++++- 2 files changed, 201 insertions(+), 189 deletions(-) delete mode 100644 tests/tokenizer-methods.test.ts diff --git a/tests/tokenizer-methods.test.ts b/tests/tokenizer-methods.test.ts deleted file mode 100644 index 7d38eea..0000000 --- a/tests/tokenizer-methods.test.ts +++ /dev/null @@ -1,188 +0,0 @@ -import { Tokenizer, AddedToken } from "../src"; - -describe("Tokenizer methods", () => { - // Create a simple BPE tokenizer for testing - // Vocab size: 9 tokens - // - 3 special tokens: , , - // - 1 unk token: - // - 5 regular tokens: a, b, c, ab, bc - - // Helper to create added token configuration - const createAddedToken = (id: number, content: string) => ({ - id, - content, - single_word: false, - lstrip: false, - rstrip: false, - normalized: false, - special: true, - }); - - const addedTokens = [ - createAddedToken(0, ""), - createAddedToken(1, ""), - createAddedToken(2, ""), - createAddedToken(3, ""), - ]; - - const tokenizerJson = { - version: "1.0", - truncation: null as any, - padding: null as any, - added_tokens: addedTokens, - normalizer: null as any, - pre_tokenizer: null as any, - post_processor: null as any, - decoder: null as any, - model: { - type: "BPE", - dropout: null as any, - unk_token: "", - continuing_subword_prefix: null as any, - end_of_word_suffix: null as any, - fuse_unk: false, - byte_fallback: false, - ignore_merges: false, - vocab: { - "": 0, - "": 1, - "": 2, - "": 3, - "a": 4, - "b": 5, - "c": 6, - "ab": 7, - "bc": 8, - }, - merges: [ - ["a", "b"], - ["b", "c"], - ] as any[], - }, - }; - - const tokenizerConfig = { - add_bos_token: false, - add_prefix_space: false, - added_tokens_decoder: Object.fromEntries( - addedTokens.map(token => [ - String(token.id), - { id: token.id, content: token.content, special: token.special }, - ]) - ), - bos_token: "", - clean_up_tokenization_spaces: false, - eos_token: "", - legacy: true, - model_max_length: 1000000000000000, - pad_token: "", - sp_model_kwargs: {}, - spaces_between_special_tokens: false, - tokenizer_class: "LlamaTokenizer", - unk_token: "", - use_default_system_prompt: false, - }; - - let tokenizer: Tokenizer; - - beforeAll(() => { - tokenizer = new Tokenizer(tokenizerJson, tokenizerConfig); - }); - - describe("token_to_id", () => { - test("should return correct ID for regular token", () => { - expect(tokenizer.token_to_id("a")).toBe(4); - expect(tokenizer.token_to_id("b")).toBe(5); - expect(tokenizer.token_to_id("c")).toBe(6); - }); - - test("should return correct ID for merged token", () => { - expect(tokenizer.token_to_id("ab")).toBe(7); - expect(tokenizer.token_to_id("bc")).toBe(8); - }); - - test("should return correct ID for special tokens", () => { - expect(tokenizer.token_to_id("")).toBe(0); - expect(tokenizer.token_to_id("")).toBe(1); - 
expect(tokenizer.token_to_id("")).toBe(2); - expect(tokenizer.token_to_id("")).toBe(3); - }); - - test("should return undefined for non-existing token", () => { - expect(tokenizer.token_to_id("xyz")).toBeUndefined(); - }); - }); - - describe("id_to_token", () => { - test("should return correct token for regular token ID", () => { - expect(tokenizer.id_to_token(4)).toBe("a"); - expect(tokenizer.id_to_token(5)).toBe("b"); - expect(tokenizer.id_to_token(6)).toBe("c"); - }); - - test("should return correct token for merged token ID", () => { - expect(tokenizer.id_to_token(7)).toBe("ab"); - expect(tokenizer.id_to_token(8)).toBe("bc"); - }); - - test("should return correct token for special token ID", () => { - expect(tokenizer.id_to_token(0)).toBe(""); - expect(tokenizer.id_to_token(1)).toBe(""); - expect(tokenizer.id_to_token(2)).toBe(""); - expect(tokenizer.id_to_token(3)).toBe(""); - }); - - test("should return undefined for non-existing ID", () => { - expect(tokenizer.id_to_token(999)).toBeUndefined(); - }); - }); - - describe("get_added_tokens_decoder", () => { - test("should return a Map", () => { - const decoder = tokenizer.get_added_tokens_decoder(); - expect(decoder).toBeInstanceOf(Map); - }); - - test("should contain all special tokens", () => { - const decoder = tokenizer.get_added_tokens_decoder(); - expect(decoder.size).toBe(4); - expect(decoder.has(0)).toBe(true); - expect(decoder.has(1)).toBe(true); - expect(decoder.has(2)).toBe(true); - expect(decoder.has(3)).toBe(true); - }); - - test("should return AddedToken objects with correct properties", () => { - const decoder = tokenizer.get_added_tokens_decoder(); - const unkToken = decoder.get(0); - expect(unkToken).toBeDefined(); - expect(unkToken?.content).toBe(""); - expect(unkToken?.special).toBe(true); - expect(unkToken).toBeInstanceOf(AddedToken); - - const bosToken = decoder.get(1); - expect(bosToken?.content).toBe(""); - expect(bosToken?.special).toBe(true); - }); - - test("should not contain regular tokens", () => { - const decoder = tokenizer.get_added_tokens_decoder(); - expect(decoder.has(4)).toBe(false); - expect(decoder.has(5)).toBe(false); - expect(decoder.has(6)).toBe(false); - }); - }); - - describe("roundtrip conversions", () => { - test("token_to_id and id_to_token should be inverse operations", () => { - const tokens = ["a", "b", "c", "ab", "bc", "", "", "", ""]; - - for (const token of tokens) { - const id = tokenizer.token_to_id(token); - expect(id).toBeDefined(); - const tokenBack = tokenizer.id_to_token(id!); - expect(tokenBack).toBe(token); - } - }); - }); -}); diff --git a/tests/tokenizers.test.ts b/tests/tokenizers.test.ts index 387aacc..9980b46 100644 --- a/tests/tokenizers.test.ts +++ b/tests/tokenizers.test.ts @@ -1,5 +1,5 @@ import fetchConfigById from "./utils/fetchConfigById"; -import { Tokenizer } from "../src"; +import { Tokenizer, AddedToken } from "../src"; import collectTests from "./utils/collectTests"; const TOKENIZER_TESTS = await collectTests(); @@ -43,3 +43,203 @@ describe("Tokenizers (model-specific)", () => { }); } }); + +describe("Tokenizer methods", () => { + // Create a simple BPE tokenizer for testing + // Vocab size: 10 tokens + // - 3 special tokens: , , + // - 1 unk token: + // - 5 regular tokens: a, b, c, ab, bc + // - 1 non-special added token: "" + const unk_token = ""; + const bos_token = ""; + const eos_token = ""; + const pad_token = ""; + const added_token = ""; + + const added_tokens = [ + new AddedToken({ + id: 0, + content: unk_token, + special: true, + }), + new 
AddedToken({ + id: 1, + content: bos_token, + special: true, + }), + new AddedToken({ + id: 2, + content: eos_token, + special: true, + }), + new AddedToken({ + id: 3, + content: pad_token, + special: true, + }), + new AddedToken({ + id: 9, + content: added_token, + special: false, // regular added token + }), + ]; + + const tokenizerJson = { + version: "1.0", + truncation: null, + padding: null, + added_tokens, + normalizer: null, + pre_tokenizer: null, + post_processor: null, + decoder: null, + model: { + type: "BPE", + dropout: null, + unk_token, + continuing_subword_prefix: null, + end_of_word_suffix: null, + fuse_unk: false, + byte_fallback: false, + ignore_merges: false, + vocab: { + [unk_token]: 0, + [bos_token]: 1, + [eos_token]: 2, + [pad_token]: 3, + a: 4, + b: 5, + c: 6, + ab: 7, + bc: 8, + }, + merges: [ + ["a", "b"], + ["b", "c"], + ], + }, + } as any; + + const tokenizerConfig = { + add_bos_token: false, + add_prefix_space: false, + added_tokens_decoder: Object.fromEntries(added_tokens.map((token) => [String(token.id), { id: token.id, content: token.content, special: token.special }])), + bos_token, + clean_up_tokenization_spaces: false, + eos_token, + legacy: true, + model_max_length: 1000000000000000, + pad_token, + sp_model_kwargs: {}, + spaces_between_special_tokens: false, + tokenizer_class: "LlamaTokenizer", + unk_token, + }; + + let tokenizer: Tokenizer; + + beforeAll(() => { + tokenizer = new Tokenizer(tokenizerJson, tokenizerConfig); + }); + + describe("token_to_id", () => { + test("should return correct ID for regular token", () => { + expect(tokenizer.token_to_id("a")).toBe(4); + expect(tokenizer.token_to_id("b")).toBe(5); + expect(tokenizer.token_to_id("c")).toBe(6); + }); + + test("should return correct ID for merged token", () => { + expect(tokenizer.token_to_id("ab")).toBe(7); + expect(tokenizer.token_to_id("bc")).toBe(8); + }); + + test("should return correct ID for special tokens", () => { + expect(tokenizer.token_to_id(unk_token)).toBe(0); + expect(tokenizer.token_to_id(bos_token)).toBe(1); + expect(tokenizer.token_to_id(eos_token)).toBe(2); + expect(tokenizer.token_to_id(pad_token)).toBe(3); + expect(tokenizer.token_to_id(added_token)).toBe(9); + }); + + test("should return undefined for non-existing token", () => { + expect(tokenizer.token_to_id("xyz")).toBeUndefined(); + }); + }); + + describe("id_to_token", () => { + test("should return correct token for regular token ID", () => { + expect(tokenizer.id_to_token(4)).toBe("a"); + expect(tokenizer.id_to_token(5)).toBe("b"); + expect(tokenizer.id_to_token(6)).toBe("c"); + }); + + test("should return correct token for merged token ID", () => { + expect(tokenizer.id_to_token(7)).toBe("ab"); + expect(tokenizer.id_to_token(8)).toBe("bc"); + }); + + test("should return correct token for special/added token ID", () => { + expect(tokenizer.id_to_token(0)).toBe(unk_token); + expect(tokenizer.id_to_token(1)).toBe(bos_token); + expect(tokenizer.id_to_token(2)).toBe(eos_token); + expect(tokenizer.id_to_token(3)).toBe(pad_token); + expect(tokenizer.id_to_token(9)).toBe(added_token); + }); + + test("should return undefined for non-existing ID", () => { + expect(tokenizer.id_to_token(999)).toBeUndefined(); + }); + }); + + describe("get_added_tokens_decoder", () => { + test("should return a Map", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + expect(decoder).toBeInstanceOf(Map); + }); + + test("should contain all special tokens", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + 
expect(decoder.size).toBe(5); + expect(decoder.has(0)).toBe(true); + expect(decoder.has(1)).toBe(true); + expect(decoder.has(2)).toBe(true); + expect(decoder.has(3)).toBe(true); + expect(decoder.has(9)).toBe(true); + }); + + test("should return AddedToken objects with correct properties", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + const unkToken = decoder.get(0); + expect(unkToken).toBeDefined(); + expect(unkToken?.content).toBe(unk_token); + expect(unkToken?.special).toBe(true); + expect(unkToken).toBeInstanceOf(AddedToken); + + const bosToken = decoder.get(1); + expect(bosToken?.content).toBe(bos_token); + expect(bosToken?.special).toBe(true); + }); + + test("should not contain regular tokens", () => { + const decoder = tokenizer.get_added_tokens_decoder(); + expect(decoder.has(4)).toBe(false); + expect(decoder.has(5)).toBe(false); + expect(decoder.has(6)).toBe(false); + }); + }); + + describe("roundtrip conversions", () => { + test("token_to_id and id_to_token should be inverse operations", () => { + const tokens = [unk_token, bos_token, eos_token, pad_token, "a", "b", "c", "ab", "bc", added_token]; + + for (const token of tokens) { + const id = tokenizer.token_to_id(token); + expect(id).toBeDefined(); + const tokenBack = tokenizer.id_to_token(id!); + expect(tokenBack).toBe(token); + } + }); + }); +}); From 6daa23c3566ba0a8374017c12f5d924e7d481f09 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Thu, 6 Nov 2025 17:32:42 -0500 Subject: [PATCH 6/6] Formatting --- src/core/PreTokenizer.ts | 10 ++++++++-- src/core/Tokenizer.ts | 6 +++++- src/core/decoder/create_decoder.ts | 1 - src/core/normalizer/create_normalizer.ts | 1 - src/core/postProcessor/create_post_processor.ts | 1 - 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/core/PreTokenizer.ts b/src/core/PreTokenizer.ts index 02cf93c..c87ec79 100644 --- a/src/core/PreTokenizer.ts +++ b/src/core/PreTokenizer.ts @@ -6,7 +6,10 @@ import type { PreTokenizeTextOptions } from "@static/tokenizer"; * A callable class representing a pre-tokenizer used in tokenization. Subclasses * should implement the `pre_tokenize_text` method to define the specific pre-tokenization logic. */ -abstract class PreTokenizer extends Callable<[string | string[], any?], string[]> { +abstract class PreTokenizer extends Callable< + [string | string[], any?], + string[] +> { /** * Method that should be implemented by subclasses to define the specific pre-tokenization logic. * @@ -14,7 +17,10 @@ abstract class PreTokenizer extends Callable<[string | string[], any?], string[] * @param options Additional options for the pre-tokenization logic. * @returns The pre-tokenized text. */ - abstract pre_tokenize_text(text: string, options?: PreTokenizeTextOptions): string[]; + abstract pre_tokenize_text( + text: string, + options?: PreTokenizeTextOptions, + ): string[]; /** * Tokenizes the given text into pre-tokens. 
diff --git a/src/core/Tokenizer.ts b/src/core/Tokenizer.ts index fc6ca22..37b9772 100644 --- a/src/core/Tokenizer.ts +++ b/src/core/Tokenizer.ts @@ -19,7 +19,11 @@ import type PreTokenizer from "./PreTokenizer"; import type TokenizerModel from "./TokenizerModel"; import type PostProcessor from "./PostProcessor"; import type Decoder from "./Decoder"; -import type { TokenConfig, TokenizerConfig, TokenizerJSON } from "@static/tokenizer"; +import type { + TokenConfig, + TokenizerConfig, + TokenizerJSON, +} from "@static/tokenizer"; interface EncodeOptions { text_pair?: string | null; diff --git a/src/core/decoder/create_decoder.ts b/src/core/decoder/create_decoder.ts index 2775dfb..4abccc9 100644 --- a/src/core/decoder/create_decoder.ts +++ b/src/core/decoder/create_decoder.ts @@ -1,4 +1,3 @@ - import ByteLevel from "./ByteLevel"; import WordPiece from "./WordPiece"; import Metaspace from "./Metaspace"; diff --git a/src/core/normalizer/create_normalizer.ts b/src/core/normalizer/create_normalizer.ts index 14c6e5a..fbe3682 100644 --- a/src/core/normalizer/create_normalizer.ts +++ b/src/core/normalizer/create_normalizer.ts @@ -1,4 +1,3 @@ - import BertNormalizer from "./BertNormalizer"; import Precompiled from "./Precompiled"; import Sequence from "./Sequence"; diff --git a/src/core/postProcessor/create_post_processor.ts b/src/core/postProcessor/create_post_processor.ts index c960163..06cee45 100644 --- a/src/core/postProcessor/create_post_processor.ts +++ b/src/core/postProcessor/create_post_processor.ts @@ -1,4 +1,3 @@ - import TemplateProcessing from "./TemplateProcessing"; import ByteLevel from "./ByteLevel"; import BertProcessing from "./BertProcessing";
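Usage note (illustrative sketch, not part of the patches above): the example below exercises the API introduced in PATCH 2/6 (token_to_id, id_to_token, and get_added_tokens_decoder), importing from the same "../src" entry point the tests use. Loading tokenizer.json / tokenizer_config.json from disk with Node's fs module, and the file names themselves, are assumptions for illustration; any objects matching the shapes constructed in the tests work.

import { readFileSync } from "node:fs";
import { Tokenizer, AddedToken } from "../src";

// Assumed file names; the tests build these objects inline instead.
const tokenizerJson = JSON.parse(readFileSync("tokenizer.json", "utf-8"));
const tokenizerConfig = JSON.parse(readFileSync("tokenizer_config.json", "utf-8"));
const tokenizer = new Tokenizer(tokenizerJson, tokenizerConfig);

// token_to_id and id_to_token both return undefined for out-of-vocabulary inputs.
const id = tokenizer.token_to_id("hello");
const roundTripped = id !== undefined ? tokenizer.id_to_token(id) : undefined;
console.log(id, roundTripped);

// get_added_tokens_decoder builds a fresh Map of added-token metadata keyed by token ID,
// so adding or removing entries on it does not touch the tokenizer's internal added-token list.
const addedTokens: Map<number, AddedToken> = tokenizer.get_added_tokens_decoder();
for (const [tokenId, addedToken] of addedTokens) {
  console.log(tokenId, addedToken.content, addedToken.special);
}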