diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
index e1fe2530..fef67abe 100644
--- a/Sources/Tokenizers/Tokenizer.swift
+++ b/Sources/Tokenizers/Tokenizer.swift
@@ -116,6 +116,7 @@ struct TokenizerModel {
         "PreTrainedTokenizer": BPETokenizer.self,
         "Qwen2Tokenizer": BPETokenizer.self,
         "WhisperTokenizer": BPETokenizer.self,
+        "XLMRobertaTokenizer": UnigramTokenizer.self,
     ]
 
     static func unknownToken(from tokenizerConfig: Config) -> String? {
diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift
index 821df186..72db5a4b 100644
--- a/Tests/TokenizersTests/TokenizerTests.swift
+++ b/Tests/TokenizersTests/TokenizerTests.swift
@@ -120,6 +120,23 @@ class PhiSimpleTests: XCTestCase {
     }
 }
 
+class RobertaTokenizerTests: XCTestCase {
+    /// Checks that `intfloat/multilingual-e5-small` loads as a `PreTrainedTokenizer`
+    /// and that encoding a sample query yields a fixed, known id sequence.
+    ///
+    /// See https://github.com/huggingface/swift-transformers/issues/99
+    func testRobertaXLMTokenizer() async throws {
+        guard let tokenizer = try await AutoTokenizer.from(pretrained: "intfloat/multilingual-e5-small") as? PreTrainedTokenizer else {
+            XCTFail("Expected AutoTokenizer to return a PreTrainedTokenizer for intfloat/multilingual-e5-small")
+            return
+        }
+
+        let ids = tokenizer.encode(text: "query: how much protein should a female eat")
+        let expected = [0, 41, 1294, 12, 3642, 5045, 21308, 5608, 10, 117776, 73203, 2]
+        XCTAssertEqual(ids, expected)
+    }
+}
+
 class UnregisteredTokenizerTests: XCTestCase {
     func testNllbTokenizer() async throws {
         do {