In [1]:
from fastai import *
from fastai.text import * 

import sentencepiece as spm
from pathlib import *
import shutil


In [2]:
# Data locations: the French corpus root, the wiki train/valid folder inside it,
# and that folder's "txt" sub-directory.
path = Path("../nlp-data/fr/")
pathSrc = path.joinpath("wiki-train_valid")
pathSrcTxt = pathSrc.joinpath("txt")


In [4]:
# --- SentencePiece training configuration -----------------------------------
vocab_size          = 32000
model_type          = "unigram"
input_sentence_size = int(1e7)
use_special_cases   = True

# From here on the source directory is the "txt" folder; the trained model is
# stored in an "sp-model" folder next to the text files.
pathSrc = pathSrcTxt
pathTrg = pathSrcTxt/"sp-model"

# Ids intended for the unknown / begin-of-sentence / end-of-sentence / padding tokens.
# NOTE(review): the training cell has the --unk_id/--bos_id/--eos_id/--pad_id flags
# commented out, so these values are not passed to SentencePiece — confirm that
# pad_idx matches the trained model before relying on it.
unk_id, bos_id, eos_id, pad_id = range(4)
pad_idx = pad_id

# Text rules: fastai's default pre-rules followed by its default post-rules.
rules = text.transform.default_pre_rules + text.transform.default_post_rules

# fastai tokens that are handed to SentencePiece as control symbols.
# text.transform.UNK, text.transform.BOS, "xxeos" and text.transform.PAD are
# deliberately left out of this list.
special_cases = [
    getattr(text.transform, token_name)
    for token_name in ("TK_MAJ", "TK_UP", "TK_REP", "TK_WREP", "FLD")
]


In [4]:
def sentencepiece_prepare(pathSrc:Path, pathTrg:Path, cache_name:str='sp-model',
                          rules:ListRules=text.transform.default_pre_rules,
                        vocab_size:int=30000, 
                        model_type:str='unigram', 
                        input_sentence_size:int=1E7, 
                        pad_idx:int=1):
    """Apply `rules` to every `wiki*` file under `pathSrc` and write the results to a temp tree.

    Each source file is read as UTF-8, passed through `rules` in order (each rule
    receives the output of the previous one), and - if the result is non-empty -
    written as UTF-8 to `pathTrg/cache_name/"tmp"`, mirroring the file's path
    relative to `pathSrc`. The temp directory is wiped and recreated on every call.

    Args:
        pathSrc: directory searched recursively for files matching `wiki*`.
        pathTrg: directory under which `cache_name/"tmp"` is created.
        cache_name: name of the sub-directory of `pathTrg` holding the temp tree.
        rules: callables applied in sequence to the text of each file.
        vocab_size, model_type, input_sentence_size, pad_idx: accepted but not
            used by this function.

    Raises:
        Exception: if the `sentencepiece` module has not been imported in this process.
    """
    # Local imports so the function does not depend on names leaking in via `import *`.
    import sys
    from functools import reduce

    if 'sentencepiece' not in sys.modules:
        raise Exception('sentencepiece module is missing: run `pip install sentencepiece`')

    pathSrc = Path(pathSrc)
    tmpDir = Path(pathTrg)/cache_name/"tmp"
    if tmpDir.exists(): shutil.rmtree(tmpDir, ignore_errors=True)
    # parents=True: pathTrg/cache_name may not exist yet.
    tmpDir.mkdir(parents=True, exist_ok=True)

    noSrcParts = len(pathSrc.parts)
    # Snapshot the inputs before writing anything: glob() is lazy, so when tmpDir
    # lives under pathSrc the freshly written copies could otherwise be picked up
    # again. Directories that happen to match `wiki*` are skipped.
    srcFiles = [fn for fn in pathSrc.glob("**/wiki*") if fn.is_file()]

    for fn in srcFiles:
        with fn.open(encoding='utf-8') as f:
            txt = reduce(lambda t, rule: rule(t), rules, f.read())

        if len(txt)>0:
            p = tmpDir.joinpath(*fn.parts[noSrcParts:])
            p.parent.mkdir(parents=True, exist_ok=True)
            # Write with the same encoding the input was read with, instead of
            # the platform default.
            with p.open("w", encoding='utf-8') as fw:
                fw.write(txt)

#%time sentencepiece_prepare(pathSrcTxt, pathSrc, "sp-model", rules=rules, vocab_size=100)

In [5]:
# Train the SentencePiece model on every `wiki*` file under pathSrcTxt and store it
# as pathTrg/"m.model" and pathTrg/"m.vocab".
pathTrg.mkdir(parents=True, exist_ok=True)
model_prefix = pathTrg / "m"

# Set the following fastai tokens to the SentencePiece values until there is a release
# where we can set the token value.
# Note: taku910 has already made the change, but the pip release of sentencepiece has
# not been updated yet.
# NOTE(review): --pad_id is not passed below, so whether "<pad>" actually gets an id
# depends on SentencePiece's default — confirm.
text.transform.UNK = "<unk>"
text.transform.BOS = "<s>"
text.transform.PAD = "<pad>"

# Create control ids for the rest of the fastai control tokens in case the user needs them.
# It is the responsibility of fastai to generate and use these control tokens and to
# handle them before decoding.
# E.g. applying TK_MAJ after tokenization would change "She" into two tokens: TK_MAJ + "she".
# Problem: SentencePiece tokenizes "Elle" as "▁Elle", so fastai's deal_caps would not catch it.
str_specialcases = ",".join(special_cases)

# Sort the inputs so the generated command line is the same on every run, and fail
# early with a clear message instead of handing an empty --input to the trainer.
pathSrc_files = sorted(str(s) for s in pathSrcTxt.glob("**/wiki*"))
if not pathSrc_files:
    raise FileNotFoundError(f"no wiki* training files found under {pathSrcTxt}")
pathSrc_list = ",".join(pathSrc_files)

# Flags deliberately not passed: --unk_id, --bos_id, --pad_id, --split_by_number,
# --input_sentence_size. --eos_id=-1 is passed explicitly.
# Other options that were considered: add_dummy_prefix, hard_vocab_limit, use_all_vocab.
# About --add_dummy_prefix=false: "This is intended as we want to treat words
# independently from the positions ("foo" and "bar foo" are handled as "_foo" and
# "_bar_foo" and there are two "_foo")."
sp_params = f"--input={pathSrc_list} "  \
            f"--eos_id=-1 " \
            f"--control_symbols={str_specialcases} " \
            f"--character_coverage=1.0 " \
            f"--model_prefix={model_prefix} " \
            f"--vocab_size={vocab_size} " \
            f"--model_type={model_type} " 
print(sp_params)
spm.SentencePieceTrainer.Train(sp_params)

--input=../nlp-data/fr/wiki-train_valid/txt/AX/wiki_73,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_87,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_80,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_74,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_89,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_42,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_45,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_11,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_16,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_29,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_20,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_27,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_18,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_44,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_88,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_43,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_75,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_81,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_86,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_72,../nlp-data/fr/wiki-train_valid/txt/AX/wiki_26,../nl

In [9]:
# Load the trained model and check encoding / decoding on a capitalised and a
# lower-case version of the same sentence.
sp = spm.SentencePieceProcessor()
sp.Load(str(pathTrg.joinpath("m.model")))

capitalised = "Elle est grande"
lowercased = "elle est grande"

print("1: Size of vocabulary:", sp.GetPieceSize())
print("2:", sp.EncodeAsPieces(capitalised + "."))
print("3:", sp.EncodeAsIds(capitalised))

# Round trips: pieces -> text and ids -> text.
pieces_roundtrip = sp.DecodePieces(sp.EncodeAsPieces(capitalised))
print("4:", pieces_roundtrip)
ids_roundtrip = sp.DecodeIds(sp.EncodeAsIds(capitalised))
print("5:", ids_roundtrip)

print("6:", sp.EncodeAsPieces(lowercased))
print("7:", sp.EncodeAsIds(lowercased))

1: Size of vocabulary: 32000
2: ['▁Elle', '▁est', '▁grande', '.']
3: [85, 22, 258]
4: Elle est grande
5: Elle est grande
6: ['▁elle', '▁est', '▁grande']
7: [81, 22, 258]


In [7]:
# Print "<piece>(<id>)" for the reserved unknown token and for each of our special cases.
reserved_cases = ["<unk>"]
sections = (
    ("Unk is a reserved control token in sentence that we cannot change - not even the symbol",
     reserved_cases),
    ("\nOur special cases and ids registrered as control token",
     special_cases),
)
for heading, pieces in sections:
    print(heading)
    for s in pieces:
        print(f"{s}({sp.PieceToId(s)})")


Unk is a reserved control token in sentence that we cannot change - not even the symbol
<unk>(0)

Our special cases and ids registrered as control token
xxmaj(2)
xxup(3)
xxrep(4)
xxwrep(5)
xxfld(6)


In [10]:
# For ids 1 and 6: show the piece, then what decoding that single id yields;
# finally ask whether id 6 is a control symbol.
probes = [
    (sp.IdToPiece, 1),
    (sp.DecodeIds, [1]),
    (sp.IdToPiece, 6),
    (sp.DecodeIds, [6]),
    (sp.is_control, 6),
]
for number, (query, argument) in enumerate(probes, start=1):
    print(f"{number}:", query(argument))

1: <s>
2: 
3: xxfld
4: 
5: True
