Fix tokenization space cleanup (#215)
gsarti committed Aug 9, 2023
Parent: 33aa13b · Commit: f8e55f8
Showing 1 changed file with 6 additions and 5 deletions.

inseq/models/huggingface_model.py (6 additions, 5 deletions):

@@ -215,10 +215,7 @@ def generate(
             **kwargs,
         )
         sequences = generation_out.sequences
-        texts = self.tokenizer.batch_decode(
-            sequences,
-            skip_special_tokens=True,
-        )
+        texts = self.decode(ids=sequences, skip_special_tokens=True)
         if return_generation_output:
             return texts, generation_out
         return texts
@@ -295,7 +292,11 @@ def decode(
         ids: Union[List[int], List[List[int]], IdsTensor],
         skip_special_tokens: bool = True,
     ) -> List[str]:
-        return self.tokenizer.batch_decode(ids, skip_special_tokens=skip_special_tokens)
+        return self.tokenizer.batch_decode(
+            ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=False,
+        )

     def embed_ids(self, ids: IdsTensor, as_targets: bool = False) -> EmbeddingsTensor:
         if as_targets and not self.is_encoder_decoder:
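
Note (illustration, not part of the commit): a minimal sketch of the behavior this patch pins down. In Hugging Face transformers, clean_up_tokenization_spaces post-processes decoded text, for example removing spaces before punctuation, so the decoded string can drift from the underlying token boundaries. Disabling it keeps decode() output aligned with the tokens. The "gpt2" checkpoint and the example string below are illustrative assumptions.

# Standalone sketch; assumes transformers is installed and the "gpt2"
# checkpoint is available (both are assumptions, not part of the commit).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
ids = tokenizer("Hello , world !").input_ids

# With cleanup enabled, spaces before punctuation are merged away:
print(tokenizer.decode(ids, clean_up_tokenization_spaces=True))
# Hello, world!

# With cleanup disabled (as in the patched decode()), the text
# round-trips the original spacing exactly:
print(tokenizer.decode(ids, clean_up_tokenization_spaces=False))
# Hello , world !

Routing generate() through self.decode() in the first hunk means generated text now passes through this same decoding path, so the setting applies consistently across the model wrapper.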
