In [None]:
from fastai2.basics import *
from fastai2.text.all import *
from fastai2.callback.all import *
from nbdev.showdoc import show_doc

# ULMFiT

## Finetune a pretrained Language Model

First we get our data and tokenize it.

In [None]:
# Get the IMDB dataset, then tokenize the texts of its `train`, `test` and
# `unsup` folders.
path = untar_data(URLs.IMDB)
tokenize_folder(path, folders=['train', 'test', 'unsup'])

In [None]:
# Re-point `path` at the `imdb_tok` folder located next to the dataset folder
# (presumably where `tokenize_folder` wrote its output -- TODO confirm).
path = untar_data(URLs.IMDB).parent/'imdb_tok'

In [None]:
# Load the pickled word counts from `counter.pkl`. The `with` block makes sure
# the file handle is closed as soon as the counts have been read.
with open(path/'counter.pkl', 'rb') as counter_file:
    count = pickle.load(counter_file)
# Build the vocabulary from those counts.
vocab = make_vocab(count)

In [None]:
# Collect the `.txt` files found under `path` and display how many there are.
texts = get_files(path, extensions=['.txt'])
len(texts)

Then we put it in a `DataSource`. For a language model, we don't have targets, so there is only one set of transforms, which reads and numericalizes the texts. Note that `tokenize_folder` also saves the count of the words in the corpus (the `counter.pkl` file loaded above) to make it easy to create a vocabulary.

In [None]:
def read_file(f):
    "Read the content of `f` and return the pieces obtained by splitting it on single spaces, as an `L`."
    tokens = f.read().split(' ')
    return L(tokens)

In [None]:
# Display `texts` to inspect the collected file paths.
texts

In [None]:
# Randomly split the files, with `valid_pct=0.1` of them going to validation.
splitter = RandomSplitter(valid_pct=0.1)
splits = splitter(texts)
# Vocabulary built from the word counts.
vocab = make_vocab(count)
# A single set of transforms: read each file, then numericalize it with `vocab`.
lm_tfms = [[read_file, Numericalize(vocab)]]
dsrc = DataSource(texts, lm_tfms, splits=splits, dl_type=LMDataLoader)

Then we use that `DataSource` to create a `DataBunch`. Here the kind of `TfmdDL` we need to use is `LMDataLoader`, which concatenates all the texts in a source (with a shuffle at each epoch for the training set), splits the result into `bs` chunks, then reads continuously through them.

In [None]:
# Batch size and sequence length for the language-model batches.
bs,sl=64,80
# Build the `DataBunch` from the `DataSource`; validation uses the same batch
# size (`val_bs=bs`) and `Cuda` is passed as the `after_batch` transform.
dbunch_lm = dsrc.databunch(bs=bs, seq_len=sl, val_bs=bs, after_batch=Cuda)

In [None]:
# Display a batch from the language-model `DataBunch` as a sanity check.
dbunch_lm.show_batch()

Then we have a convenience method to directly grab a `Learner` from it, using the `AWD_LSTM` architecture.

In [None]:
# Optimizer factory: Adam with `wd=0.1`.
opt_func = partial(Adam, wd=0.1)

# Language-model learner on `dbunch_lm` with the AWD_LSTM architecture,
# reporting accuracy and perplexity; `path` is handed to the learner.
lm_metrics = [accuracy, Perplexity()]
learn = language_model_learner(
    dbunch_lm, AWD_LSTM,
    opt_func=opt_func, metrics=lm_metrics, path=path,
)
# learn = learn.to_fp16(clip=0.1)

In [None]:
# First one-cycle fit: 1 epoch at a learning rate of 2e-2 with the given momentums.
learn.fit_one_cycle(1, 2e-2, moms=(0.8,0.7,0.8))

In [None]:
# Checkpoint the learner under the name 'stage1'.
learn.save('stage1')

In [None]:
# Reload the 'stage1' checkpoint (the trailing `;` hides the cell output).
learn.load('stage1');

In [None]:
# Unfreeze the whole model, then fit for 10 epochs at a lower learning rate (2e-3).
learn.unfreeze()
learn.fit_one_cycle(10, 2e-3, moms=(0.8,0.7,0.8))

Once we have fine-tuned the pretrained language model to this corpus, we save the encoder since we will use it for the classifier.

In [None]:
# Save only the encoder, under the name 'finetuned1', for reuse by the classifier.
learn.save_encoder('finetuned1')

## Use it to train a classifier

In [None]:
# Collect the `.txt` files again, this time restricted to the `train` and
# `test` folders.
texts = get_files(path, extensions=['.txt'], folders=['train', 'test'])

In [None]:
# Split the files with `GrandparentSplitter`, using 'test' as the validation
# name (presumably matched against each file's grandparent folder -- confirm).
splits = GrandparentSplitter(valid_name='test')(texts)

For classification, we need to use two sets of transforms: one to numericalize the texts and the other to encode the labels as categories.

In [None]:
# Two sets of transforms: the inputs are read and numericalized with `vocab`,
# the targets come from `parent_label` and are encoded with `Categorize`.
x_tfms = [read_file, Numericalize(vocab)]
y_tfms = [parent_label, Categorize()]
dsrc = DataSource(texts, [x_tfms, y_tfms], splits=splits, dl_type=SortedDL)

In [None]:
# Batch size for the classifier; also used below to scale the learning rate.
bs = 64

In [None]:
# Build the classifier `DataBunch`: `pad_input` is passed as the `before_batch`
# step and `Cuda` as the `after_batch` transform.
dbunch = dsrc.databunch(before_batch=pad_input, after_batch=Cuda, bs=bs)

In [None]:
# Display at most two samples from the classifier `DataBunch`.
dbunch.show_batch(max_n=2)

Then we once again have a convenience function to create a classifier from this `DataBunch` with the `AWD_LSTM` architecture.

In [None]:
# Optimizer factory: Adam with `wd=0.1`.
opt_func = partial(Adam, wd=0.1)
# Classifier learner on `dbunch` with the AWD_LSTM architecture and the
# language-model `vocab`, reporting accuracy.
learn = text_classifier_learner(
    dbunch, AWD_LSTM, vocab,
    metrics=[accuracy], path=path, drop_mult=0.5, opt_func=opt_func,
)

We load our pretrained encoder.

In [None]:
# Load the encoder saved as 'finetuned1' into the classifier, then switch the
# learner to fp16 with `clip=0.1`.
learn = learn.load_encoder('finetuned1')
learn = learn.to_fp16(clip=0.1)

Then we can train with gradual unfreezing and differential learning rates.

In [None]:
# Base learning rate, scaled linearly with the batch size (1e-1 at bs=128).
lr = 1e-1 * bs/128

In [None]:
# First classifier fit: 1 epoch at the base learning rate `lr`.
learn.fit_one_cycle(1, lr, moms=(0.8,0.7,0.8), wd=0.1)

In [None]:
# Gradual unfreezing step: `freeze_to(-2)`, halve the learning rate, then fit
# one epoch with learning rates ranging from lr/2.6**4 up to lr.
learn.freeze_to(-2)
lr = lr / 2
lr_range = slice(lr/(2.6**4), lr)
learn.fit_one_cycle(1, lr_range, moms=(0.8,0.7,0.8), wd=0.1)

In [None]:
# Gradual unfreezing step: `freeze_to(-3)`, halve the learning rate again, then
# fit one epoch with learning rates ranging from lr/2.6**4 up to lr.
learn.freeze_to(-3)
lr = lr / 2
lr_range = slice(lr/(2.6**4), lr)
learn.fit_one_cycle(1, lr_range, moms=(0.8,0.7,0.8), wd=0.1)

In [None]:
# Final step: unfreeze the whole model, divide the learning rate by 5, then
# fit two epochs with learning rates ranging from lr/2.6**4 up to lr.
learn.unfreeze()
lr = lr / 5
lr_range = slice(lr/(2.6**4), lr)
learn.fit_one_cycle(2, lr_range, moms=(0.8,0.7,0.8), wd=0.1)