In [1]:
%load_ext autoreload
%autoreload 2
from fastai import *
from fastai.text import * 
import sentencepiece as spm


In [2]:
path=Path("../nlp-data/fr/wiki-train_valid")

# Data preparation

Load the training and validation data we prepared in wiki_preparation.ipynb. In total 100 million articles with a split of 80% / 20% for training/validation:
- First column: text content to train the model. 
- Second column: Boolean representing if the data is for training or validation.

In [3]:
dfText = pd.read_csv(path/'wiki.csv')
#sort increasing and take the last nrows for training and validatiom
dfText.sort_values('textWords', inplace=True)
display(dfText.head())

#select the sections with most words
nrows  = int(1e6)
dfText = dfText[-nrows:]

#split the data into train and validation
split, index = 0.2, np.random.permutation(np.arange(nrows))
splitindex   = int(nrows*split+.5)

dfText.drop(labels=["title","url","textWords"],axis=1,inplace=True)

dfText["is_valid"] = False
dfText.iloc[index[:splitindex], dfText.columns.get_loc("is_valid")] = True
dfText.to_csv(path/"train_and_valid.csv", header=None, index=None)

train_and_valid = pd.read_csv(path/'train_and_valid.csv', header=None, names=['label', 'content'])

Unnamed: 0,id,text,textWords,title,url
184201,10468104,Références.\n \n,1,Nord and Bert Couldn't Make Head or Tail of It,https://fr.wikipedia.org/wiki?curid=10468104
1264477,7052901,"\nPatronyme.\n,",1,Contaut,https://fr.wikipedia.org/wiki?curid=7052901
851122,5923581,Eichner:\n,1,Eichner,https://fr.wikipedia.org/wiki?curid=5923581
22453,4094043,Balleny:\n,1,Balleny,https://fr.wikipedia.org/wiki?curid=4094043
728494,6124549,Scholtz:,1,Scholtz,https://fr.wikipedia.org/wiki?curid=6124549


In [4]:
train_and_valid.head()

Unnamed: 0,label,content
7840298,"Georges Turines, né le à Lautignac et mort le ...",False
3343103,Le Lac Travis est un réservoir situé sur le Co...,False
1593728,"Vraptchichté (en , en ) est une municipalité d...",False
4139306,Tigran Levonovitch Petrossian (en ) est un gra...,False
5636896,L'Oberlin noir (N595) est un cépage.\nDescript...,True


# Create Tokenizer
The sentencepiece vocabulary was trained in Train Sentencepiece tokenizer.ipynb.

Here we will make a BasicTokenizer from Sentencepiece so that fastai can use it instead of spacy

In [18]:
class SentencepieceTokenizer(BaseTokenizer):
    def __init__(self, path:PathOrStr, cache_name:str='tmp'):
        text.transform.UNK = "<unk>"
        text.transform.BOS = "<s>"
        text.transform.PAD = "<pad>"
        
        self.tok = spm.SentencePieceProcessor()
        self.tok.Load(str(Path(path) / cache_name / 'm.model'))
        
        self.vocab_ = SentencepieceTokenizer.loadvocab_(path, cache_name)
        
    def tokenizer(self, t:str) -> List[str]:
        #get the tokens and replace unk from sentencepiece with unk from fastai
        return self.tok.EncodeAsPieces(t)
        #return [text.transform.UNK if t=="<unk>" else t for t in self.tok.EncodeAsPieces(t)]
    
    def add_special_cases(self, toks:Collection[str]):
        #this should have been done when training sentencepiece
        pass
    
    
    def vocab(self): return self.vocab_
    @staticmethod
    def loadvocab_(path:PathOrStr, cache_name):
        p_vocab = Path(path) / cache_name / "m.vocab"
        with open(str(p_vocab), 'r') as f:
            vocab = [line.split('\t')[0] for line in f.readlines()]
        
        p_itos = Path(path) / cache_name / 'itos.pkl'
        pickle.dump(vocab, open( p_itos, 'wb'))
        vocab_ = Vocab(pickle.load(open(p_itos, 'rb')))
        return vocab_

spt       = SentencepieceTokenizer(path, cache_name="sp-model")
tokenizer = Tokenizer(tok_func=spt)


['<unk>', '<s>', '</s>', '▁de', '.', ',', "'", 's', '▁la', '▁et']


In [22]:
vocab = pt.vocab()
print(type(vocab))
vocab.textify(np.arange(10))
vocab.numericalize(["<unk>", "<s>", "</s>", "▁de", "." ,"s", "▁la", "▁et"])

<class 'fastai.text.transform.Vocab'>


[0, 1, 2, 3, 4, 7, 8, 9]

In [23]:
tokenizer = SentencepieceTokenizer(path,"sp-model")
print(text.transform.BOS)
print(text.transform.UNK)
print(text.transform.FLD)
print(text.transform.PAD)
tokenizer.vocab()

['<unk>', '<s>', '</s>', '▁de', '.', ',', "'", 's', '▁la', '▁et']
<s>
<unk>
xxfld
<pad>


<fastai.text.transform.Vocab at 0x1a804a42b0>

# LM Training


We train two LM: one with a 60k vocabulary and one with a 30k vocabulary. The two models have different performance and computation needs.

In [None]:
data_lm_full = (TextList.from_csv(path, csv_name="train_and_valid.csv", tokenizer=tokenizer, vocab=tokenizer.vocab())
                #Inputs: all the text files in path
                #.split_from_df(col=1)
                #We may have other temp folders that contain text files so we only keep what's in train and test
                #.label_for_lm()           
                #We want to do a language model so we label accordingly
                .databunch(bs=32)
               )

In [None]:
data_lm_full.save('full_lm_60k')

In [None]:
data_lm_full = TextLMDataBunch.load(PATH, 'full_lm_60k', bs=32)

In [None]:
len(data_lm_full.train_ds.vocab.itos)

In [None]:
data_lm_full.show_batch()



In [None]:
learn = language_model_learner(data_lm_full, drop_mult=0, qrnn=True, callback_fns=ShowGraph)
learn.lr_find()
learn.recorder.plot(skip_start=0)

In [None]:
learn = language_model_learner(data_lm_full, drop_mult=0, qrnn=True, callback_fns=ShowGraph)
learn.fit_one_cycle(10, 2e-3, moms=(0.8,0.7))

In [None]:
learn.fit_one_cycle(5, 2e-3, moms=(0.8,0.7))

In [None]:
learn.fit_one_cycle(1, 2e-4, moms=(0.8,0.7))

In [None]:
learn.save('model-60k-vocab')

Now we train a model with a 30k vocabulary.
Because of this, batch size can be higher and training is quicker.

In [None]:
data_lm_full = (TextList.from_csv(PATH, csv_name='fulltrain.csv', cols=0, processor=[TokenizeProcessor(tokenizer=tokenizer), NumericalizeProcessor(max_vocab=30000)])
           #Inputs: all the text files in path
            .split_from_df(col=1)
           #We may have other temp folders that contain text files so we only keep what's in train and test
            .label_for_lm()           
           #We want to do a language model so we label accordingly
            .databunch(bs=64))

In [None]:
data_lm_full.save('full_lm_30k')

In [None]:
data_lm_full = TextLMDataBunch.load(PATH, 'full_lm_30k', bs=64)

In [None]:
len(data_lm_full.train_ds.vocab.itos)

In [None]:
learn = language_model_learner(data_lm_full, drop_mult=0, qrnn=False, callback_fns=ShowGraph)
learn.lr_find()
learn.recorder.plot(skip_start=0)

In [None]:
learn = language_model_learner(data_lm_full, drop_mult=0, qrnn=False, callback_fns=ShowGraph)

In [None]:
learn.fit_one_cycle(5, 1e-3)

The model perplexity is exp(validation loss):

In [None]:
np.exp(3.377596)

In [None]:
learn.save('model-30k-vocab')