In [31]:
# import sentencepiece as spm
# s = spm.SentencePieceProcessor(model_file='spm.model')
# for n in range(5):
#     s.encode('New York', out_type=str, enable_sampling=True, alpha=0.1, nbest=-1)


In [14]:
# import os
# os.listdir('../../data/wiki.train.tokens')

### Train SentencePiece

In [18]:
import sentencepiece as spm
# Train a SentencePiece model with a 5000-entry vocabulary on the wiki training
# split. Output files use the prefix "spm"; later cells load 'spm.model'.
params = '--input=../../data/wiki.train.tokens --model_prefix=spm --vocab_size=5000'
spm.SentencePieceTrainer.train(params)

True

### Use SentencePiece

In [29]:
import sentencepiece as spm
# Load the trained model into a processor shared by the following cells.
sp = spm.SentencePieceProcessor()
sp.Load('spm.model')

# Deterministic (best-segmentation) encoding, as pieces and as vocabulary ids.
print( sp.EncodeAsPieces(input='New York') )
new_york_ids = sp.EncodeAsIds(input='New York')
print( new_york_ids )
# Decode the ids we just produced instead of hard-coding them: the numeric ids
# depend on the trained model and would silently go stale after retraining.
print( sp.DecodeIds(new_york_ids) )

['▁New', '▁York']
[195, 467]
New York


In [61]:
# Join a list of pieces back into plain text.
sp.DecodePieces(['▁New', '▁York'])

'New York'

In [65]:
# Look up the vocabulary id of a single piece.
sp.PieceToId('▁New')

195

In [None]:
# IdToPiece maps an integer vocabulary id back to its piece, so it must be given
# an id rather than a piece string. Round-trip through PieceToId to show the
# two lookups are inverses of each other.
sp.IdToPiece(sp.PieceToId('▁New'))

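A major appeal of SentencePiece is subword regularization: instead of always returning the same segmentation, the encoder can sample a different segmentation of the same text on each call. To enable this, use `sample_encode_as_pieces` (returns pieces) or `SampleEncodeAsIds` (returns ids) in place of `EncodeAsPieces` / `EncodeAsIds`.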

In [39]:
# Encode the same text five times; with sampling enabled each call may return
# a different segmentation.
for n in range(5):
    # Alternative that yields ids instead of pieces:
    #     sp.SampleEncodeAsIds('New York', alpha=0.1, nbest_size=-1)
    sampled_pieces = sp.sample_encode_as_pieces('New York', alpha=0.1, nbest_size=-1)
    print( sampled_pieces )

['▁', 'N', 'e', 'w', '▁York']
['▁New', '▁York']
['▁N', 'e', 'w', '▁York']
['▁New', '▁', 'Y', 'or', 'k']
['▁New', '▁York']


### Current FullTokenizer
```
class FullTokenizer(object):
    """Runs end-to-end tokenziation."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)

        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_tokens_to_ids(self.vocab, tokens)

    def convert_ids_to_tokens(self, tokens):
        return convert_ids_to_tokens(self.vocab, tokens)

    def convert_to_unicode(self, text):
        return convert_to_unicode(text)

```

### Creating an abstract class

In [50]:
from abc import ABC, abstractmethod

class tokenizer_abstract_Class(ABC):
    """Interface that every tokenizer implementation must provide.

    Subclasses must override all three methods below; the class itself
    cannot be instantiated. The declared parameters match the way the
    concrete tokenizers in this notebook are called.
    """

    @abstractmethod
    def tokenize(self, text):
        """Split ``text`` into a list of tokens."""

    @abstractmethod
    def convert_tokens_to_ids(self, tokens):
        """Convert a sequence of tokens into their vocabulary ids."""

    @abstractmethod
    def convert_ids_to_tokens(self, ids):
        """Convert a sequence of vocabulary ids back into tokens."""
    
#     def convert_to_unicode(self):
#         return convert_to_unicode(text)
    
import sentencepiece as spm

class SPTokenizer(tokenizer_abstract_Class):
    """Tokenizer backed by a SentencePiece model, using sampled encoding.

    Args:
        model_path: path of the SentencePiece model file to load.
        nbest_size: forwarded unchanged to ``sample_encode_as_pieces``.
        alpha: forwarded unchanged to ``sample_encode_as_pieces``.
    """

    def __init__(self, model_path='spm.model', nbest_size=-1, alpha=0.1):
        processor = spm.SentencePieceProcessor()
        processor.Load(model_path)
        self.sp = processor
        self.nbest_size = nbest_size
        self.alpha = alpha

    def tokenize(self, text):
        """Return a sampled list of pieces for ``text``."""
        return self.sp.sample_encode_as_pieces(
            text, nbest_size=self.nbest_size, alpha=self.alpha
        )

    def convert_tokens_to_ids(self, tokens):
        """Return the vocabulary id of every piece in ``tokens``, as a list."""
        return list(map(self.sp.PieceToId, tokens))

    def convert_ids_to_tokens(self, ids):
        """Return the piece for every vocabulary id in ``ids``, as a list."""
        return list(map(self.sp.IdToPiece, ids))

### Test the tokenizer

In [86]:
string = "New York"

# Build the tokenizer once and reuse it; constructing SPTokenizer reloads the
# model file from disk each time.
tokenizer = SPTokenizer('spm.model')

# tokenize() samples a segmentation, so the printed pieces can differ per run.
tokenized_string = tokenizer.tokenize(string)
print(f"tokenized string: {tokenized_string}")

tokenized_string_id = tokenizer.convert_tokens_to_ids(tokenized_string)
print(f"tokenized string id: {tokenized_string_id}")

recovered_pieces = tokenizer.convert_ids_to_tokens(tokenized_string_id)
print(f"recovered pieces: {recovered_pieces}")

# Round-trip check: pieces -> ids -> pieces must give back the same pieces.
assert tokenized_string == recovered_pieces

tokenized string: ['▁N', 'ew', '▁York']
tokenized string id: [273, 2109, 467]
recovered pieces: ['▁N', 'ew', '▁York']


In [None]:
# Re-create the module-level processor from 'spm.model' (same two steps as in
# the "Use SentencePiece" cell), e.g. after a kernel restart.
sp = spm.SentencePieceProcessor()
sp.Load('spm.model')

In [87]:
# sp.sample_encode_as_pieces('the quick brown fox jumped over the lazy dog')