AddedVocabulary - Add tests, update bindings + various tweaks
n1t0 committed Jun 18, 2020
1 parent c6f633e commit 7cedb13
Showing 12 changed files with 327 additions and 91 deletions.
12 changes: 11 additions & 1 deletion bindings/node/lib/bindings/tokenizer.d.ts
@@ -392,6 +392,15 @@ export interface AddedTokenOptions {
* @default False
*/
singleWord?: boolean;
/**
* Whether this token should match against the normalized version of the text. For example,
* with the added token `yesterday` and a normalizer in charge of lowercasing the text,
* the input `I saw a lion Yesterday` would match the token.
* This defaults to False for special tokens, and True otherwise.
* @default True
*/
normalized?: boolean;

}

/**
@@ -404,9 +413,10 @@ export class AddedToken {
/**
* Instantiate a new AddedToken
* @param content The content of the token
* @param special Whether this is a special token
* @param [options] Options for the token
*/
constructor(content: string, options?: AddedTokenOptions);
constructor(content: string, special: boolean, options?: AddedTokenOptions);

/**
* Get the content of the AddedToken
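These option flags map directly onto builder methods of the core crate's `AddedToken`, as the native binding further down in this diff shows (`single_word`, `rstrip` and `normalized` appear there; `lstrip` is assumed to be their left-hand counterpart). A minimal Rust sketch of the equivalent construction, with example token contents:

```rust
use tokenizers::tokenizer::AddedToken;

fn main() {
    // Plain added token: `normalized` defaults to true, so with a lowercasing
    // normalizer the input `Yesterday` would still match `yesterday`.
    let yesterday = AddedToken::from("yesterday", false).normalized(true);

    // Special token: `normalized` defaults to false, so it must appear
    // verbatim in the raw input. `lstrip(true)` also lets it consume
    // whitespace on its left, e.g. ` [MASK]` in `I saw a [MASK]`.
    let mask = AddedToken::from("[MASK]", true)
        .single_word(true)
        .lstrip(true)
        .rstrip(false);

    let _ = (yesterday, mask);
}
```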
12 changes: 6 additions & 6 deletions bindings/node/lib/bindings/tokenizer.test.ts
@@ -32,17 +32,17 @@ import {

describe("AddedToken", () => {
it("instantiates with only content", () => {
const addToken = new AddedToken("test");
const addToken = new AddedToken("test", false);
expect(addToken.constructor.name).toEqual("AddedToken");
});

it("instantiates with empty options", () => {
const addToken = new AddedToken("test", {});
const addToken = new AddedToken("test", false, {});
expect(addToken.constructor.name).toEqual("AddedToken");
});

it("instantiates with options", () => {
const addToken = new AddedToken("test", {
const addToken = new AddedToken("test", false, {
leftStrip: true,
rightStrip: true,
singleWord: true
@@ -52,7 +52,7 @@ describe("AddedToken", () => {

describe("getContent", () => {
it("returns the string content of AddedToken", () => {
const addedToken = new AddedToken("test");
const addedToken = new AddedToken("test", false);
expect(addedToken.getContent()).toEqual("test");
});
});
@@ -107,7 +107,7 @@ describe("Tokenizer", () => {
it("accepts a list of AddedToken as new tokens when initial model is empty", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
const addedToken = new AddedToken("test");
const addedToken = new AddedToken("test", false);

const nbAdd = tokenizer.addTokens([addedToken]);
expect(nbAdd).toBe(1);
Expand All @@ -132,7 +132,7 @@ describe("Tokenizer", () => {

const model = BPE.empty();
tokenizer = new Tokenizer(model);
tokenizer.addTokens(["my", "name", "is", "john", new AddedToken("pair")]);
tokenizer.addTokens(["my", "name", "is", "john", new AddedToken("pair", false)]);

encode = promisify(tokenizer.encode.bind(tokenizer));
encodeBatch = promisify(tokenizer.encodeBatch.bind(tokenizer));
41 changes: 31 additions & 10 deletions bindings/node/native/src/tokenizer.rs
@@ -30,10 +30,11 @@ struct AddedTokenOptions {
singleWord: Option<bool>,
leftStrip: Option<bool>,
rightStrip: Option<bool>,
normalized: Option<bool>,
}
impl AddedTokenOptions {
fn into_added_token(self, content: String) -> tk::AddedToken {
let mut token = tk::AddedToken::from(content);
fn into_added_token(self, content: String, special: bool) -> tk::AddedToken {
let mut token = tk::AddedToken::from(content, special);
if let Some(sw) = self.singleWord {
token = token.single_word(sw);
}
@@ -43,6 +44,9 @@ impl AddedTokenOptions {
if let Some(rs) = self.rightStrip {
token = token.rstrip(rs);
}
if let Some(n) = self.normalized {
token = token.normalized(n);
}
token
}
}
@@ -52,18 +56,20 @@ declare_types! {
init(mut cx) {
// init(
// content: string,
// special: boolean,
// options?: {
// singleWord?: boolean = false,
// leftStrip?: boolean = false,
// rightStrip?: boolean = false
// normalized?: boolean = true,
// }
// )

let content = cx.extract::<String>(0)
.map_err(|_| Error("First argument must be string".into()))?;
let token = cx.extract_opt::<AddedTokenOptions>(1)?
let content = cx.extract::<String>(0)?;
let special = cx.extract::<bool>(1)?;
let token = cx.extract_opt::<AddedTokenOptions>(2)?
.unwrap_or_else(AddedTokenOptions::default)
.into_added_token(content);
.into_added_token(content, special);

Ok(AddedToken { token })
}
@@ -87,7 +93,7 @@ impl FromJsValue for AddedToken {
fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, cx: &mut C) -> LibResult<Self> {
if let Ok(token) = from.downcast::<JsString>() {
Ok(AddedToken {
token: tk::AddedToken::from(token.value()),
token: tk::AddedToken::from(token.value(), false),
})
} else if let Ok(token) = from.downcast::<JsAddedToken>() {
let guard = cx.lock();
@@ -99,6 +105,21 @@ }
}
}

struct SpecialToken(tk::AddedToken);
impl FromJsValue for SpecialToken {
fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, cx: &mut C) -> LibResult<Self> {
if let Ok(token) = from.downcast::<JsString>() {
Ok(SpecialToken(tk::AddedToken::from(token.value(), true)))
} else if let Ok(token) = from.downcast::<JsAddedToken>() {
let guard = cx.lock();
let token = token.borrow(&guard);
Ok(SpecialToken(token.token.clone()))
} else {
Err(Error("Expected `string | AddedToken`".into()))
}
}
}

// encode & encodeBatch types

struct TextInputSequence(tk::InputSequence);
@@ -623,7 +644,7 @@

let this = cx.this();
let guard = cx.lock();
let token = this.borrow(&guard).tokenizer.id_to_token(id);
let token = this.borrow(&guard).tokenizer.id_to_token(id).map(|t| t.to_owned());

if let Some(token) = token {
Ok(cx.string(token).upcast())
@@ -650,9 +671,9 @@
method addSpecialTokens(mut cx) {
// addSpecialTokens(tokens: (string | AddedToken)[]): number

let tokens = cx.extract_vec::<AddedToken>(0)?
let tokens = cx.extract_vec::<SpecialToken>(0)?
.into_iter()
.map(|token| token.into())
.map(|token| token.0)
.collect::<Vec<_>>();

let mut this = cx.this();
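The `SpecialToken` newtype above exists only to flip the coercion default: a bare string handed to `addSpecialTokens` becomes an `AddedToken` with `special = true`, while the `FromJsValue` impl for `AddedToken` keeps `special = false` for `addTokens`. A sketch of that dispatch, with hypothetical free functions standing in for the two JS entry points:

```rust
use tokenizers::tokenizer::AddedToken;

// Hypothetical stand-ins for the coercion performed by the two JS methods.
fn coerce_for_add_tokens(raw: &str) -> AddedToken {
    // addTokens: a plain added token; `normalized` defaults to true.
    AddedToken::from(raw, false)
}

fn coerce_for_add_special_tokens(raw: &str) -> AddedToken {
    // addSpecialTokens: a special token; `normalized` defaults to false.
    AddedToken::from(raw, true)
}

fn main() {
    let _pair = coerce_for_add_tokens("pair");
    let _ent = coerce_for_add_special_tokens("[ENT]");
}
```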
2 changes: 1 addition & 1 deletion bindings/python/src/tokenizer.rs
@@ -29,7 +29,7 @@ impl AddedToken {
#[new]
#[args(kwargs = "**")]
fn new(content: &str, is_special_token: bool, kwargs: Option<&PyDict>) -> PyResult<Self> {
let mut token = tk::tokenizer::AddedToken::from(content.to_owned(), is_special_token);
let mut token = tk::tokenizer::AddedToken::from(content, is_special_token);

if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
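Note the dropped `.to_owned()`: within this commit the constructor is called with both a `String` (the node binding passes `token.value()`) and a `&str` (here), which suggests `AddedToken::from` is generic over the content type. A quick sketch of both call shapes, assumed to compile on that basis:

```rust
use tokenizers::tokenizer::AddedToken;

fn main() {
    let from_str = AddedToken::from("<unk>", true); // &str, as in the Python binding
    let from_string = AddedToken::from(String::from("<unk>"), true); // String, as in the node binding
    let _ = (from_str, from_string);
}
```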
31 changes: 24 additions & 7 deletions bindings/python/tokenizers/__init__.pyi
@@ -200,27 +200,44 @@ class AddedToken:
"""

def __new__(
cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
cls,
content: str,
is_special_token: bool,
single_word: bool = False,
lstrip: bool = False,
rstrip: bool = False,
normalized: bool = True,
) -> AddedToken:
""" Instantiate a new AddedToken
Args:
content: str:
The content of the token
is_special_token: bool:
Whether this token is a special token. This has an impact on the default value for
`normalized`, which is False for special tokens, but True for others.
single_word: bool
Whether this token should only match against single word. If True,
this token will never match inside of a word.
Whether this token should only match against single words. If True,
this token will never match inside of a word. For example, the token `ing` would
match on `tokenizing` if this option is False, but not if it is True.
lstrip: bool
Whether this token should strip all potential whitespaces on the left side.
If True, this token will greedily match any whitespace on the left and then strip
them out.
If True, this token will greedily match any whitespace on the left. For example,
if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
we will match on ` [MASK]`.
rstrip: bool
Whether this token should strip all potential whitespaces on the right side.
If True, this token will greedily match any whitespace on the right and then strip
them out.
If True, this token will greedily match any whitespace on the right. It works just
like lstrip, but on the right.
normalized: bool:
Whether this token should match the normalized version of the input text. For
example, with the added token `yesterday` and a normalizer in charge of lowercasing
the text, the token could be extracted from the input `I saw a lion Yesterday`.
"""
pass

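Taken together, a sketch of how these flags are typically applied when registering tokens on a tokenizer, mirroring the Rust benchmark below (crate paths assumed from the repo layout; `BPE::default()` stands in for a trained model):

```rust
use tokenizers::models::bpe::BPE;
use tokenizers::tokenizer::{AddedToken, Tokenizer};

fn main() {
    let mut tokenizer = Tokenizer::new(Box::new(BPE::default()));

    // A sub-word token allowed to match inside words, e.g. within `tokenizing`.
    tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);

    // A special token restricted to whole words and matched on the raw input.
    tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
}
```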
18 changes: 9 additions & 9 deletions tokenizers/README.md
@@ -9,7 +9,7 @@
<img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue">
</a>
<a href="https://docs.rs/tokenizers/">
<img alt="Doc" src="https://docs.rs/tokenizers/badge.svg">
</a>
</p>
<br>
@@ -56,22 +56,22 @@ fn main() -> Result<()>{
.vocab_size(vocab_size)
.min_frequency(0)
.special_tokens(vec![
AddedToken::from("<s>".into()),
AddedToken::from("<pad>".into()),
AddedToken::from("</s>".into()),
AddedToken::from("<unk>".into()),
AddedToken::from("<mask>".into()),
AddedToken::from("<s>", true),
AddedToken::from("<pad>", true),
AddedToken::from("</s>", true),
AddedToken::from("<unk>", true),
AddedToken::from("<mask>", true),
])
.build(),
);

let mut tokenizer = Tokenizer::new(Box::new(BPE::default()));
tokenizer.with_normalizer(Box::new(Sequence::new(vec![
Box::new(Strip::new(true, true)),
Box::new(NFC),
])));
tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default()));

tokenizer.train(&trainer, vec!["/path/to/train.txt".to_string()])?;
tokenizer.save("/path/to/trained_tokenizer", true)?;

@@ -86,7 +86,7 @@ use tokenizers::Result;
use tokenizers::tokenizer::Tokenizer;

fn main() -> Result<()>{

let tokenizer = Tokenizer::from_file("/path/to/trained_tokenizer")?;

let sample_encoding = tokenizer.encode("Huggingface", false)?;
5 changes: 2 additions & 3 deletions tokenizers/benches/bpe_benchmark.rs
@@ -17,9 +17,8 @@ fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer {
let mut tokenizer = Tokenizer::new(Box::new(bpe));
tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default()));
tokenizer.with_decoder(Box::new(ByteLevel::default()));
tokenizer.add_tokens(&[AddedToken::from(String::from("ing"), false).single_word(false)]);
tokenizer
.add_special_tokens(&[AddedToken::from(String::from("[ENT]"), true).single_word(true)]);
tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);
tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
tokenizer
}

