In [None]:
%load_ext autoreload
%autoreload 2
from fastai import *
from fastai.text import * 
import sentencepiece as spm


In [None]:
# Directory holding the train/validation data and the SentencePiece model folder.
pathTrainValid = Path("../../data/nlp-data/fr") / "wiki-train_valid"

# Data preparation

Load the training and validation data prepared in `wiki_preparation.ipynb`: 100 million articles in total, split 80% / 20% between training and validation. The data has two columns:
- First column: text content to train the model. 
- Second column: Boolean representing if the data is for training or validation.

# Create Tokenizer
The sentencepiece vocabulary was trained in Train Sentencepiece tokenizer.ipynb.

Here we wrap SentencePiece in a fastai `BaseTokenizer` subclass so that fastai can use it instead of spaCy.

In [None]:
class SentencepieceTokenizer(BaseTokenizer):
    """fastai `BaseTokenizer` subclass backed by a pre-trained SentencePiece model.

    Both the pickled vocabulary list (`itos.pkl`) and the SentencePiece model
    (`m.model`) are read from `pathTrainValid / "sp-model"`.
    """

    def __init__(self, lang:str):
        """Load the vocabulary and the SentencePiece model from disk.

        Args:
            lang: language code. It is accepted so the class can be constructed
                with a single language argument, but it is not used here: the
                files are always read from `pathTrainValid / "sp-model"`.
        """
        path, cache_name = pathTrainValid, "sp-model"
        self.pathVocab = path / cache_name

        # Use a context manager so the file handle is closed deterministically.
        # NOTE(security): pickle.load can execute arbitrary code; only load an
        # itos.pkl that was produced by a trusted process.
        with open(self.pathVocab/'itos.pkl', 'rb') as itos_file:
            self.vocab_ = Vocab(pickle.load(itos_file))

        self.tok = spm.SentencePieceProcessor()
        self.tok.Load(str(self.pathVocab / 'm.model'))

        # Module-level side effect: overrides fastai's UNK constant with "<unk>".
        # Presumably this makes fastai's unknown token match the one in the
        # SentencePiece vocabulary — confirm against the trained model.
        text.transform.UNK = "<unk>"

    def tokenizer(self, t:str) -> List[str]:
        """Split `t` into SentencePiece pieces and return them as a list of strings."""
        return self.tok.EncodeAsPieces(t)

    def add_special_cases(self, toks:Collection[str]):
        """Intentionally a no-op: special tokens should have been handled when training SentencePiece."""
        pass

    def vocab(self):
        """Return the `Vocab` built from `itos.pkl` in `__init__`."""
        return self.vocab_

In [None]:
# Standalone instance, used in the cells below to inspect the vocabulary.
spt       = SentencepieceTokenizer(lang="fr")
# The fastai Tokenizer receives the class itself plus the language code;
# presumably it instantiates the class internally — confirm against the fastai docs.
tokenizer = Tokenizer(SentencepieceTokenizer,"fr")

In [None]:
# Print the text form of vocabulary ids 0..29 as a quick sanity check.
print(spt.vocab().textify(np.arange(30)))

# Sample sentences for manually exercising the tokenizer; not used by the visible cells.
sentence = ["Elle est grande. Il est petit", "Il est petit. Elle est grande."]
#tokenizer._process_all_1(sentence)

In [None]:
# Look up the ids of "<unk>", the xx-prefixed special tokens and the piece "▁de"
# to check how they map in the SentencePiece vocabulary.
spt.vocab().numericalize( ["<unk>" ,"xxbos" ,"xxpad" ,"xxmaj" ,"xxup" ,"xxrep" ,"xxwrep", "xxfld", "▁de"]  )

In [None]:
# Grab the id-to-string list and report the vocabulary size.
tok = spt.vocab().itos
print(len(tok))
#[print(t) for t in tok]
# Last expression of the cell: display the fastai Tokenizer object.
tokenizer

# LM Training


We train two language models (LMs): one with a 60k vocabulary and one with a 30k vocabulary. The two models differ in performance and in computational cost.

In [None]:
# Reuse the SentencePiece vocabulary; max_vocab is set to its number of entries.
vocab = spt.vocab()
max_vocab = len(vocab.itos)

In [None]:
# Build the language-model databunch from the CSV using the SentencePiece
# tokenizer and its vocabulary (min_freq=0, max_vocab = vocabulary size).
# NOTE(review): column 1 (described above as the train/validation flag) is passed
# as label_cols here, whereas the 30k pipeline further down uses
# split_from_df(col=1) — confirm which train/validation split is intended.
data_lm_full = TextLMDataBunch.from_csv( pathTrainValid, csv_name='deepfrance.csv', text_cols=0, label_cols=1,
                                         tokenizer=tokenizer, vocab=vocab,
                                         max_vocab=max_vocab,
                                         min_freq=0
                                 )

In [None]:
# Save the processed databunch under the name 'full_lm' so the next cell can reload it.
data_lm_full.save('full_lm')

In [None]:
# Reload the saved 'full_lm' databunch from pathTrainValid with batch size 32.
data_lm_full = TextLMDataBunch.load(pathTrainValid, 'full_lm', bs=32)

In [None]:
# Vocabulary size actually attached to the training dataset.
len(data_lm_full.train_ds.vocab.itos)

In [None]:
# Display a sample batch to eyeball the tokenized text.
data_lm_full.show_batch()

In [None]:
# Learner used only for the learning-rate search: drop_mult=0, QRNN disabled,
# pad_token=-1, with ShowGraph attached as a callback.
learn = language_model_learner(data_lm_full, drop_mult=0, qrnn=False, pad_token=-1, callback_fns=ShowGraph)
learn.lr_find()
# skip_start=0 plots the recorded curve from its first point.
learn.recorder.plot(skip_start=0)

In [None]:
# Re-create the learner with the same settings before training; presumably so that
# training does not start from the state left behind by lr_find — confirm.
learn = language_model_learner(data_lm_full, drop_mult=0, qrnn=False, pad_token=-1, callback_fns=ShowGraph)
# One-cycle training: 10 epochs, learning rate 2e-3, momentum range (0.8, 0.7).
learn.fit_one_cycle(10, 2e-3, moms=(0.8,0.7))

In [None]:
# Save the trained weights.
# NOTE(review): the checkpoint name says "30k", while the surrounding text presents
# this first model as the 60k-vocabulary one — confirm which is correct.
learn.save('model-30k-sentencepiece-vocab')

In [None]:
# Print fastai's current BOS, FLD and PAD token constants.
print(text.transform.BOS)
print(text.transform.FLD)
print(text.transform.PAD)
# Last expression of the cell: show the type of the training labels object.
type(data_lm_full.train_ds.y)

In [None]:
# exp(4): the perplexity corresponding to a loss of 4 (perplexity = exp(loss)).
np.exp(4)

In [None]:
# Perplexity for a loss of 3.239415 — presumably the validation loss of the run above; confirm.
np.exp(3.239415)

Now we train a model with a 30k vocabulary.
Because the vocabulary is smaller, the batch size can be larger and training is faster.

In [None]:
# Build the 30k-vocabulary language-model databunch with the data block API.
# NOTE(review): the original cell referenced `PATH`, which is never defined in this
# notebook (NameError on a fresh kernel). `pathTrainValid` is the only data directory
# defined here — confirm that 'fulltrain.csv' actually lives there.
data_lm_full = (TextList.from_csv(pathTrainValid, csv_name='fulltrain.csv', cols=0,
                                  processor=[TokenizeProcessor(tokenizer=tokenizer),
                                             NumericalizeProcessor(max_vocab=30000)])
           # Inputs: the text in column 0 of the CSV, tokenized with the SentencePiece
           # tokenizer and numericalized with a vocabulary capped at 30,000 entries
            .split_from_df(col=1)
           # Column 1 of the CSV decides which rows go to training vs. validation
            .label_for_lm()
           # We want a language model, so we label accordingly
            .databunch(bs=64))

In [None]:
# Save the processed 30k databunch under the name 'full_lm_30k' so it can be reloaded.
data_lm_full.save('full_lm_30k')

In [None]:
# Reload the saved 'full_lm_30k' databunch with batch size 64.
# NOTE(review): the original cell used `PATH`, which is never defined in this notebook;
# `pathTrainValid` is the only data directory defined here — confirm it is where the
# databunch was saved.
data_lm_full = TextLMDataBunch.load(pathTrainValid, 'full_lm_30k', bs=64)

In [None]:
# Vocabulary size attached to the training dataset of the 30k databunch.
len(data_lm_full.train_ds.vocab.itos)

In [None]:
# Learner used only for the learning-rate search: drop_mult=0, QRNN disabled,
# with ShowGraph attached as a callback.
learn = language_model_learner(data_lm_full, drop_mult=0, qrnn=False, callback_fns=ShowGraph)
learn.lr_find()
# skip_start=0 plots the recorded curve from its first point.
learn.recorder.plot(skip_start=0)

In [None]:
# Re-create the learner with the same settings before the actual training run.
learn = language_model_learner(data_lm_full, drop_mult=0, qrnn=False, callback_fns=ShowGraph)

In [None]:
# One-cycle training: 5 epochs at learning rate 1e-3.
learn.fit_one_cycle(5, 1e-3)

The model perplexity is exp(validation loss):

In [None]:
# Perplexity for a validation loss of 3.377596 — presumably taken from the training run above; confirm.
np.exp(3.377596)

In [None]:
# Save the trained weights of the 30k-vocabulary model.
learn.save('model-30k-vocab')