In [1]:
from fastai import *
from fastai.text import * 

import sentencepiece as spm
from pathlib import *
import shutil

In [2]:
!pip install sentencepiece



In [3]:
# Root folder of the French wiki data; the classes below derive their sub-folders from it.
path = Path("../../data/nlp-data/fr/")

# Name of the folder (below "wiki-train_valid") that holds the sentencepiece model files.
cache_name = "sp-model"

# fastai control tokens whose sentencepiece ids are inspected further down in the notebook.
special_cases = [
    getattr(text.transform, token_name)
    for token_name in ("BOS", "PAD", "TK_MAJ", "TK_UP", "TK_REP", "TK_WREP", "FLD")
]

In [4]:
import sentencepiece as spm

class SentencepieceWikiModel:
    """Turn an extracted wiki dump into training data and train a sentencepiece vocabulary on it.

    Folders and files derived from `path`:
      * `path/"wiki-json"`                    - input: files named `wiki*`, one json object per line
      * `path/"wiki-train_valid"/"txt"`       - cleaned text, one file per input file
      * `path/"wiki-train_valid"/"wiki.csv"`  - every kept section as one csv row
      * `path/"wiki-train_valid"/cache_name`  - sentencepiece output (`m.model`, `m.vocab`) and `itos.pkl`
    """
    def __init__(self, path:Path, cache_name:str='sp-model', 
                 vocab_size:int=32000, model_type:str='unigram', 
                 rules=text.transform.default_pre_rules ):
        """Store the configuration and make sure the vocabulary folder exists.

        path:       root data folder (must contain `wiki-json` before `wikidump2TrainingData` is run)
        cache_name: name of the vocabulary folder below `wiki-train_valid`
        vocab_size: passed to sentencepiece as `--vocab_size`
        model_type: passed to sentencepiece as `--model_type`
        rules:      callables `str -> str`, applied in order to every text before it is stored
        """
        self.path           = Path(path)
        self.pathTrainValid = self.path/"wiki-train_valid"
        self.pathVocab      = self.pathTrainValid / cache_name
        self.vocab_size = vocab_size
        self.model_type = model_type
        self.rules = rules

        # parents=True: this also creates `wiki-train_valid` itself when it is missing
        self.pathVocab.mkdir(parents=True, exist_ok=True)

    def prepareWiki_old(self):
        """Legacy preprocessing: run `self.rules` over each `wiki*` file as one single string.

        The results are written to a freshly re-created `wiki-train_valid/tmp` folder that
        mirrors the folder layout of the source.
        NOTE(review): the original version read an undefined `pathJson`; `path/"wiki-json"` is
        assumed here because `wikidump2TrainingData` uses that folder - confirm.
        """
        pathJson = self.path/"wiki-json"
        tmpDir   = self.pathTrainValid/"tmp"
        if tmpDir.exists(): shutil.rmtree(tmpDir, ignore_errors=True)
        tmpDir.mkdir(parents=True,exist_ok=True)

        noSrcParts = len(pathJson.parts)
        for fn in pathJson.glob("**/wiki*"):
            with fn.open(encoding='utf-8') as f:
                txt = reduce(lambda t, rule: rule(t), self.rules, f.read())

            if len(txt)>0:
                # keep the part of the path below `wiki-json` so the folder layout is mirrored
                p = tmpDir.joinpath(*fn.parts[noSrcParts:])
                p.parent.mkdir(parents=True, exist_ok=True)
                with p.open("w", encoding='utf-8') as fw:
                    fw.write(txt)

    def wikidump2TrainingData(self):
        """Generate the text files and the csv file used for training.

        Every line of every `wiki*` file below `wiki-json` is parsed as a json object with at
        least the keys `title` and `text`. The text is cleaned with `self.rules`, its words are
        counted into `textWords`, and sections without any word are dropped. The kept sections
        are written
          * as plain text to `wiki-train_valid/txt/...` (same relative path as the source file),
            used for training the sentencepiece vocabulary, and
          * as rows of `wiki-train_valid/wiki.csv`, used for training a language model.
        """
        pathJson = self.path/"wiki-json"
        pathTxt  = self.pathTrainValid/"txt"

        txt = []
        noSrcParts = len(pathJson.parts)
        for fn in pathJson.glob("**/wiki*"):
            sections = []
            with open(fn, encoding='utf-8') as f:
                for line in f:
                    if not line.strip(): continue   # a blank line is not a json document

                    section = json.loads(line)
                    title   = section['title']
                    # Only cut the title off when the text really starts with it. Testing for the
                    # title anywhere in the text would chop the start of unrelated text.
                    # The extra 2 characters are presumably the blank line after the title.
                    if section['text'].startswith(title):
                        section['text'] = section['text'][len(title)+2:]

                    section['text']      = section['text'].replace('\n\n', "\n")
                    section['text']      = reduce(lambda t, rule: rule(t), self.rules, section['text'])
                    section['textWords'] = len(re.findall(r'\w+',section['text']))

                    if section['textWords'] > 0:
                        txt.append(section)
                        sections.append(section)

            if len(sections)>0:
                p = pathTxt.joinpath(*fn.parts[noSrcParts:])
                p.parent.mkdir(parents=True, exist_ok=True)
                # NOTE(review): sections are written back-to-back without a separator; when a
                # text does not end with a newline its last line is joined with the first line
                # of the next section - confirm this is intended.
                with p.open("w", encoding='utf-8') as fw:
                    for s in sections:
                        fw.write(s["text"])

        pd.DataFrame(txt).to_csv(self.pathTrainValid/'wiki.csv',index=False)

    def trainVocabulary(self): 
        """Train sentencepiece on the text files and export the vocabulary for fastai.

        Reads every `wiki*` file below `wiki-train_valid/txt`, lets sentencepiece write `m.model`
        and `m.vocab` into the vocabulary folder and stores the pieces (first column of
        `m.vocab`) as a pickled list in `itos.pkl`.
        Side effect: sets `text.transform.UNK` to "<unk>".

        Raises FileNotFoundError when there are no text files to train on.
        """
        pathSrcTxt   = self.pathTrainValid / "txt"
        model_prefix = self.pathVocab / "m"

        # Use the sentencepiece symbol for unknown until there is a release where the token
        # value can be set. (taku910 has already made the change, but the pip version of
        # sentencepiece has not been updated.)
        text.transform.UNK = "<unk>"
        #text.transform.BOS = "<s>"
        #text.transform.PAD = "<pad>"

        # Create ids for the rest of the fastai control tokens in case the user needs them.
        # It is the responsibility of fastai to generate and use the control tokens and to
        # apply them before decoding.
        # E.g. applying TK_MAJ after tokenization would change "She" to two tokens TK_MAJ+"she".
        # Problem! Sentencepiece would tokenize "Elle" as "_Elle" so deal_caps would not catch it.
        special_cases=[ 
                        text.transform.BOS,
                        text.transform.PAD,
                        text.transform.TK_MAJ,
                        text.transform.TK_UP,
                        text.transform.TK_REP,
                        text.transform.TK_WREP,
                        text.transform.FLD ] 
        str_specialcases = ",".join(special_cases) 

        # sorted: glob order depends on the file system; a fixed order keeps runs comparable
        src_files = sorted(str(s) for s in pathSrcTxt.glob("**/wiki*"))
        if not src_files:
            raise FileNotFoundError(f"no wiki* text files found below {pathSrcTxt}; "
                                    "run wikidump2TrainingData() first")
        pathSrc_list = ",".join(src_files)

        # bos/eos/pad ids are set to -1 here; the fastai tokens are passed in as
        # user defined symbols instead.
        sp_params = f"--input={pathSrc_list} "  \
                    "--bos_id=-1 " \
                    "--eos_id=-1 " \
                    "--pad_id=-1 " \
                    f"--user_defined_symbols={str_specialcases} " \
                    "--character_coverage=1.0 " \
                    f"--model_prefix={model_prefix} " \
                    f"--vocab_size={self.vocab_size} " \
                    f"--model_type={self.model_type} " 

        #f"--split_by_number=1 " \
        #hard_vocab_limit=False
        #use_all_vocab
        #print(sp_params)
        spm.SentencePieceTrainer.Train(sp_params)

        # Convert the sentencepiece vocabulary to a format fastai can read.
        # m.vocab is tab separated with the piece in the first column.
        with open(self.pathVocab/"m.vocab", 'r', encoding='utf-8') as f:
            vocab = [line.split('\t')[0] for line in f]
        with open(self.pathVocab/"itos.pkl", "wb") as f:
            pickle.dump(vocab, f)
        

In [5]:
class SentencepieceTokenizer(BaseTokenizer):
    """Tokenizer for fastai that splits text with a trained sentencepiece model.

    The model (`m.model`) and the vocabulary (`itos.pkl`) are read from `path/cache_name`.
    """
    def __init__(self, lang:str, path=None, cache_name:str="sp-model"):
        """Load the sentencepiece model and its vocabulary.

        lang:       accepted for compatibility with the tokenizer interface; not used here
        path:       folder that contains `cache_name`. When omitted, the module-level variable
                    `pathTrainValid` is used (the original behaviour), which therefore has to
                    be defined before the tokenizer is created.
        cache_name: name of the folder holding `m.model` and `itos.pkl`

        Side effect: sets `text.transform.UNK` to "<unk>".
        """
        if path is None: path = pathTrainValid
        self.pathVocab = Path(path) / cache_name

        # pickle executes code on load - only point this at a vocabulary you created yourself
        with open(self.pathVocab/'itos.pkl', 'rb') as f:
            self.vocab_ = Vocab(pickle.load(f))

        self.tok = spm.SentencePieceProcessor()
        self.tok.Load(str(self.pathVocab / 'm.model'))
        text.transform.UNK = "<unk>"

    def tokenizer(self, t:str) -> List[str]:
        "Split `t` into sentencepiece pieces."
        return self.tok.EncodeAsPieces(t)
    
    def add_special_cases(self, toks:Collection[str]):
        "Nothing to do: this should have been done when training sentencepiece."
        pass
    
    def vocab(self):
        "Return the `Vocab` built from `itos.pkl`."
        return self.vocab_

In [None]:
# Convert the wiki dump below `path` into the txt files and wiki.csv used for training.
# %time reports how long the conversion takes.
spwm = SentencepieceWikiModel(path=path)
%time spwm.wikidump2TrainingData()

In [6]:
# Train the sentencepiece vocabulary on the txt files written by wikidump2TrainingData().
# This is slow: the recorded run below took about 13 minutes wall time.
spwm = SentencepieceWikiModel(path=path)

%time spwm.trainVocabulary()

CPU times: user 46min 13s, sys: 16.8 s, total: 46min 30s
Wall time: 12min 59s


In [10]:
# Load the trained model and round-trip a sample sentence through pieces and ids.
sp = spm.SentencePieceProcessor()
p = path.joinpath("wiki-train_valid", cache_name, "m.model")
print(p)
sp.Load(str(p))

sentence = "Elle est grande. Il est petit"
pieces = sp.EncodeAsPieces(sentence)
ids    = sp.EncodeAsIds(sentence)

print("1: Size of vocabulary:", sp.GetPieceSize())
print("2:", pieces)
print("3:", ids)
# decoding must give the original sentence back
print("4:", sp.DecodePieces(pieces))
print("5:", sp.DecodeIds(ids))

../../data/nlp-data/fr/wiki-train_valid/sp-model/m.model
1: Size of vocabulary: 32000
2: ['▁Elle', '▁est', '▁grande', '.', '▁Il', '▁est', '▁petit']
3: [89, 23, 254, 11, 43, 23, 560]
4: Elle est grande. Il est petit
5: Elle est grande. Il est petit


In [11]:
# List the id sentencepiece assigns to its own reserved symbols and to the fastai tokens.
# "<s>" and "<pad>" were trained with id -1 and are reported with the same id as "<unk>".
reserved_cases = ["<unk>","<s>","<pad>"]
report = [
    ("Unk is a reserved control token in sentence that we cannot change - not even the symbol",
     reserved_cases),
    ("\nOur special cases and ids registrered as control token",
     special_cases),
]
for header, cases in report:
    print(header)
    for s in cases:
        print("{}({})".format(s, sp.PieceToId(s)))


Unk is a reserved control token in sentence that we cannot change - not even the symbol
<unk>(0)
<s>(0)
<pad>(0)

Our special cases and ids registrered as control token
xxbos(1)
xxpad(2)
xxmaj(3)
xxup(4)
xxrep(5)
xxwrep(6)
xxfld(7)


In [12]:
# Look at ids 1 and 6: the piece behind each id, how a lone id decodes, and whether
# sentencepiece treats id 6 as a control symbol.
checks = [
    sp.IdToPiece(1),
    sp.DecodeIds([1]),
    sp.IdToPiece(6),
    sp.DecodeIds([6]),
    sp.is_control(6),
]
for number, result in enumerate(checks, start=1):
    print(f"{number}:", result)

1: xxbos
2: xxbos
3: xxwrep
4: xxwrep
5: False


In [None]:
# Folder with the training/validation data.
# NOTE(review): this relative path ("../nlp-data/fr/...") differs from `path` in the config
# cell ("../../data/nlp-data/fr/"), and no visible cell reads `pathSP` - confirm which is right.
pathSP=Path("../nlp-data/fr/wiki-train_valid")