From c8753842cca44a505b8730aa4a41e97eae27e9ae Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Thu, 11 Sep 2025 19:35:57 +0200
Subject: [PATCH 1/3] Fix legacy behaviour.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There was a case I had not considered 😢

Fixes #96
---
 Sources/Tokenizers/Tokenizer.swift         | 22 +++++++++++++++++++---
 Tests/TokenizersTests/TokenizerTests.swift | 11 +++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
index e1fe2530..cd69fb97 100644
--- a/Sources/Tokenizers/Tokenizer.swift
+++ b/Sources/Tokenizers/Tokenizer.swift
@@ -278,9 +278,9 @@ public class PreTrainedTokenizer: Tokenizer {
     public var unknownTokenId: Int? { model.unknownTokenId }
     public var fuseUnknownTokens: Bool { model.fuseUnknownTokens }
 
-    private let addedTokens: Set<String>
-    private let specialTokens: [String: Int]
-    private let addedTokensRegex: NSRegularExpression?
+    let addedTokens: Set<String>
+    let specialTokens: [String: Int]
+    let addedTokensRegex: NSRegularExpression?
 
     private let preTokenizer: PreTokenizer?
     private let normalizer: Normalizer?
@@ -721,4 +721,20 @@ class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
         let updatedData = Config(configDictionary)
         try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: updatedData, strict: strict)
     }
+
+    /// If `self.legacy` is set to `False`, a prefix token is added unless the first token is special.
+    /// https://github.com/huggingface/transformers/blob/e6dcf8abd6f65bb4b6dfc1831b20d9ba49ce00e2/src/transformers/models/t5/tokenization_t5.py#L374-L387
+    override func tokenize(text: String) -> [String] {
+        if isLegacy || text.isEmpty {
+            return super.tokenize(text: text)
+        }
+
+        let tokens = super.tokenize(text: sentencePieceUnderline + text.replacingOccurrences(of: sentencePieceUnderline, with: " "))
+
+        let second = tokens.dropFirst().first
+        if tokens.first == sentencePieceUnderline, second != nil, specialTokens[second!] != nil {
+            return Array(tokens[1...])
+        }
+        return tokens
+    }
 }
diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift
index 821df186..62ea3a0d 100644
--- a/Tests/TokenizersTests/TokenizerTests.swift
+++ b/Tests/TokenizersTests/TokenizerTests.swift
@@ -118,6 +118,17 @@ class PhiSimpleTests: XCTestCase {
         XCTAssertEqual(tokenizer.encode(text: "hello world"), [15339, 1917])
         XCTAssertEqual(tokenizer.encode(text: "<|im_start|>user<|im_sep|>Who are you?<|im_end|><|im_start|>assistant<|im_sep|>"), [100264, 882, 100266, 15546, 527, 499, 30, 100265, 100264, 78191, 100266])
     }
+
+    /// https://github.com/huggingface/swift-transformers/issues/96
+    func testLegacyLlamaBehaviour() async throws {
+        guard let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Phi-3-mini-4k-instruct-4bit-no-q-embed") as? PreTrainedTokenizer else {
+            XCTFail()
+            return
+        }
+
+        let inputIds = tokenizer(" Hi")
+        XCTAssertEqual(inputIds, [1, 29871, 6324])
+    }
 }
 
 class UnregisteredTokenizerTests: XCTestCase {

From 46777a57f763cd2de171a4a8c5601d011a7cfb5c Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Thu, 11 Sep 2025 19:38:24 +0200
Subject: [PATCH 2/3] Update Sources/Tokenizers/Tokenizer.swift

---
 Sources/Tokenizers/Tokenizer.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
index cd69fb97..58fa1cff 100644
--- a/Sources/Tokenizers/Tokenizer.swift
+++ b/Sources/Tokenizers/Tokenizer.swift
@@ -722,7 +722,7 @@ class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
         try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: updatedData, strict: strict)
     }
 
-    /// If `self.legacy` is set to `False`, a prefix token is added unless the first token is special.
+    /// If `isLegacy` is `False`, a prefix token is added unless the first token is special.
     /// https://github.com/huggingface/transformers/blob/e6dcf8abd6f65bb4b6dfc1831b20d9ba49ce00e2/src/transformers/models/t5/tokenization_t5.py#L374-L387
     override func tokenize(text: String) -> [String] {
         if isLegacy || text.isEmpty {

From 69f20f42fa58c1bd19f8fce03486713e3300a3cb Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Fri, 12 Sep 2025 09:31:09 +0200
Subject: [PATCH 3/3] No force unwrap

---
 Sources/Tokenizers/Tokenizer.swift | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
index cd69fb97..9e66d660 100644
--- a/Sources/Tokenizers/Tokenizer.swift
+++ b/Sources/Tokenizers/Tokenizer.swift
@@ -730,9 +730,7 @@ class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
         }
 
         let tokens = super.tokenize(text: sentencePieceUnderline + text.replacingOccurrences(of: sentencePieceUnderline, with: " "))
-
-        let second = tokens.dropFirst().first
-        if tokens.first == sentencePieceUnderline, second != nil, specialTokens[second!] != nil {
+        if tokens.first == sentencePieceUnderline, let second = tokens.dropFirst().first, specialTokens[second] != nil {
             return Array(tokens[1...])
         }
         return tokens
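
Illustrative sketch (not part of the patch series): the non-legacy prefix handling introduced in PATCH 1 and refined in PATCH 3, extracted as a standalone Swift function so the control flow can be read outside the tokenizer class. The names `nonLegacyTokenize` and `baseTokenize` are assumptions made for this example; `baseTokenize` stands in for `super.tokenize(text:)` and `specialTokens` for the tokenizer's special-token table.

import Foundation

// Sketch of the non-legacy path of `LlamaPreTrainedTokenizer.tokenize(text:)`
// as it looks after PATCH 3 (names here are illustrative, not library API).
func nonLegacyTokenize(
    _ text: String,
    sentencePieceUnderline: String = "\u{2581}",
    specialTokens: [String: Int],
    baseTokenize: (String) -> [String]
) -> [String] {
    if text.isEmpty {
        return baseTokenize(text)
    }
    // Prepend the SentencePiece underline, mapping any literal underlines in
    // the input back to plain spaces first.
    let prefixed = sentencePieceUnderline + text.replacingOccurrences(of: sentencePieceUnderline, with: " ")
    let tokens = baseTokenize(prefixed)
    // Drop the bare prefix token when it is immediately followed by a special
    // token (the case reported in issue #96).
    if tokens.first == sentencePieceUnderline, let second = tokens.dropFirst().first, specialTokens[second] != nil {
        return Array(tokens[1...])
    }
    return tokens
}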