Fix tokenization space cleanup (#215)
gsarti committed Aug 9, 2023
Parent: 33aa13b · Commit: f8e55f8
Showing 1 changed file with 6 additions and 5 deletions.

inseq/models/huggingface_model.py (6 additions, 5 deletions):

@@ -215,10 +215,7 @@ def generate(
             **kwargs,
         )
         sequences = generation_out.sequences
-        texts = self.tokenizer.batch_decode(
-            sequences,
-            skip_special_tokens=True,
-        )
+        texts = self.decode(ids=sequences, skip_special_tokens=True)
         if return_generation_output:
             return texts, generation_out
         return texts
@@ -295,7 +292,11 @@ def decode(
         ids: Union[List[int], List[List[int]], IdsTensor],
         skip_special_tokens: bool = True,
     ) -> List[str]:
-        return self.tokenizer.batch_decode(ids, skip_special_tokens=skip_special_tokens)
+        return self.tokenizer.batch_decode(
+            ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=False,
+        )

     def embed_ids(self, ids: IdsTensor, as_targets: bool = False) -> EmbeddingsTensor:
         if as_targets and not self.is_encoder_decoder:
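
Note (illustration, not part of the commit): a minimal sketch of the behavior this patch pins down. In Hugging Face transformers, clean_up_tokenization_spaces post-processes decoded text, for example removing spaces before punctuation, so the decoded string can drift from the underlying token boundaries. Disabling it keeps decode() output aligned with the tokens. The "gpt2" checkpoint and the example string below are illustrative assumptions.

# Standalone sketch; assumes transformers is installed and the "gpt2"
# checkpoint is available (both are assumptions, not part of the commit).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
ids = tokenizer("Hello , world !").input_ids

# With cleanup enabled, spaces before punctuation are merged away:
print(tokenizer.decode(ids, clean_up_tokenization_spaces=True))
# Hello, world!

# With cleanup disabled (as in the patched decode()), the text
# round-trips the original spacing exactly:
print(tokenizer.decode(ids, clean_up_tokenization_spaces=False))
# Hello , world !

Routing generate() through self.decode() in the first hunk means generated text now passes through this same decoding path, so the setting applies consistently across the model wrapper.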
