In [None]:
from local.imports import *
from local.test import *
from local.core import *
from local.layers import *
from local.data.all import *
from local.notebook.showdoc import show_doc
from local.optimizer import *
from local.learner import *
from local.metrics import *
from local.text.data import *
from local.text.models.core import *
from local.text.models.awdlstm import *
from local.callback.rnn import *
from local.callback.all import *

# Integration test on Wikitext-2

> Training a Language Model on WT2

In [None]:
# Resolve the dataset referenced by `URLs.WIKITEXT_TINY` to a local path
# (presumably downloading/extracting it on first use — confirm against `decompress_data`).
# `path` is used below to locate `train.txt` and `valid.txt`.
path = decompress_data(URLs.WIKITEXT_TINY)

The dataset comes with all the articles concatenated. We split them to be able to shuffle at the beginning of each epoch.

In [None]:
def istitle(line):
    "Whether `line` is a top-level heading of the form ` = Some title = ` (sub-headings with repeated `=` do not match)"
    return re.search(r'^ = [^=]* = $', line) is not None

def read_file(filename):
    "Read `filename` (utf8) and return an `L` of articles, each one a list of tokens obtained by splitting on single spaces"
    articles = L()
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    pending = []  # lines of the article currently being accumulated
    last_lookahead = len(lines) - 2
    for idx, text in enumerate(lines):
        pending.append(text)
        # An article stops right before a ' \n' separator line that is itself followed by a top-level title.
        ends_article = idx < last_lookahead and lines[idx+1] == ' \n' and istitle(lines[idx+2])
        if ends_article:
            articles.append(''.join(pending).split(' '))
            pending = []
    # Whatever is left after the last boundary forms the final article.
    articles.append(''.join(pending).split(' '))
    return articles

Then we put our list of tokenized texts together in an `LM_Dataset`. It will return tuples of sequences of `seq_len`, with the second sequence being the first one shifted by one to the right.

In [None]:
# Batch size and sequence length, shared by both splits.
bs = 104
sl = 72
# Only the training split is built with `shuffle=True`.
train = LM_Dataset(
    read_file(path/'train.txt'),
    bs=bs, seq_len=sl, shuffle=True,
)
valid = LM_Dataset(
    read_file(path/'valid.txt'),
    bs=bs, seq_len=sl,
)

In [None]:
# Sanity check: show the first item of the training dataset, a pair of token lists
# where the second list is the first one advanced by one token.
print(train[0])

(['', '\n', '=', 'Geopyxis', 'carbonaria', '=', '\n', '\n', 'Geopyxis', 'carbonaria', 'is', 'a', 'species', 'of', 'fungus', 'in', 'the', 'genus', 'Geopyxis', ',', 'family', '<unk>', '.', 'First', 'described', 'to', 'science', 'in', '1805', ',', 'and', 'given', 'its', 'current', 'name', 'in', '1889', ',', 'the', 'species', 'is', 'commonly', 'known', 'as', 'the', 'charcoal', 'loving', 'elf', '@-@', 'cup', ',', 'dwarf', '<unk>', 'cup', ',', '<unk>', '<unk>', 'cup', ',', 'or', 'pixie', 'cup', '.', 'The', 'small', ',', '<unk>', '@-@', 'shaped', 'fruitbodies', 'of', 'the'], ['\n', '=', 'Geopyxis', 'carbonaria', '=', '\n', '\n', 'Geopyxis', 'carbonaria', 'is', 'a', 'species', 'of', 'fungus', 'in', 'the', 'genus', 'Geopyxis', ',', 'family', '<unk>', '.', 'First', 'described', 'to', 'science', 'in', '1805', ',', 'and', 'given', 'its', 'current', 'name', 'in', '1889', ',', 'the', 'species', 'is', 'commonly', 'known', 'as', 'the', 'charcoal', 'loving', 'elf', '@-@', 'cup', ',', 'dwarf', '<unk>', 

We can then wrap our `LM_Dataset`s in a `TfmdList` to apply the `Numericalize` transform. We can't use a `TfmdDS` because our elements are already tuples and `TfmdDS` is there to create such tuples from individual items. Since we already have tuples, we specify `as_item=False`.

In [None]:
# Token frequencies are taken from the training texts only; the same vocab numericalizes both splits.
count = Counter(tok for text in train.ds for tok in text)
vocab = make_vocab(count)
train_ds = TfmdList(
    train,
    tfms=Numericalize(vocab),
    as_item=False, wrap_l=False,
)
valid_ds = TfmdList(
    valid,
    tfms=Numericalize(vocab),
    as_item=False, wrap_l=False,
)

Last but not least, we need to use a special sampler that will make sure we ask for the correct sequences to form a batch: in the first batch we don't want the sequences 0,1,2,3... (they are contiguous in the source obtained by concatenating all texts) but the sequences 0,`num_batches`,`2*num_batches`,...

In [None]:
# Each loader gets an `LM_Sampler` built on the corresponding `LM_Dataset` (not on the numericalized list).
train_dl = TfmdDL(
    train_ds, bs=bs,
    sampler=LM_Sampler(train),
    tfms=Cuda(), num_workers=0,
)
valid_dl = TfmdDL(
    valid_ds, bs=bs,
    sampler=LM_Sampler(valid),
    tfms=Cuda(), num_workers=0,
)

In [None]:
# Bundle the training and validation loaders together, then display a few
# (text, shifted text) rows to check the batches visually.
dbch = DataBunch(train_dl, valid_dl)
dbch.show_batch()

index,text,text_
0,"\n = Geopyxis carbonaria = \n \n Geopyxis carbonaria is a species of fungus in the genus Geopyxis , family <unk> . First described to science in 1805 , and given its current name in 1889 , the species is commonly known as the charcoal loving elf @-@ cup , dwarf <unk> cup , <unk> <unk> cup , or pixie cup . The small , <unk> @-@ shaped fruitbodies of the","\n = Geopyxis carbonaria = \n \n Geopyxis carbonaria is a species of fungus in the genus Geopyxis , family <unk> . First described to science in 1805 , and given its current name in 1889 , the species is commonly known as the charcoal loving elf @-@ cup , dwarf <unk> cup , <unk> <unk> cup , or pixie cup . The small , <unk> @-@ shaped fruitbodies of the fungus"
1,"Creek is in a sandstone and shale mountain region , it has a relatively low capacity to neutralize added acid . This makes it especially vulnerable to increased <unk> from acid rain , which poses a threat to the long term health of the plants and animals in the creek . The total <unk> ( TA ) is a measure of the capacity of water to neutralize acid , with a larger","is in a sandstone and shale mountain region , it has a relatively low capacity to neutralize added acid . This makes it especially vulnerable to increased <unk> from acid rain , which poses a threat to the long term health of the plants and animals in the creek . The total <unk> ( TA ) is a measure of the capacity of water to neutralize acid , with a larger TA"
2,"the homes of wealthy Jews . \n Finally her soliciting paid off and in 1881 , Rosebery was offered a government position acceptable to him , that of Under Secretary at the Home Office with special responsibility for Scotland . He had sought the position feeling that Scotland was neglected by the Liberal Government who were more interested in Ireland . However , immediately upon assuming the job he began to demand","homes of wealthy Jews . \n Finally her soliciting paid off and in 1881 , Rosebery was offered a government position acceptable to him , that of Under Secretary at the Home Office with special responsibility for Scotland . He had sought the position feeling that Scotland was neglected by the Liberal Government who were more interested in Ireland . However , immediately upon assuming the job he began to demand a"
3,"and Sweden , and the Danish and Swedish governments threatened to terminate the SAS agreement . On 25 November 1966 , with 82 against 62 votes , the Norwegian Parliament voted to allow Braathens SAFE to fly to Bodø and Tromsø as an extension of the West Coast route from 1 April 1967 . \n On 3 March 1967 , parliament decided to build four short take @-@ off and landing airports","Sweden , and the Danish and Swedish governments threatened to terminate the SAS agreement . On 25 November 1966 , with 82 against 62 votes , the Norwegian Parliament voted to allow Braathens SAFE to fly to Bodø and Tromsø as an extension of the West Coast route from 1 April 1967 . \n On 3 March 1967 , parliament decided to build four short take @-@ off and landing airports along"
4,"the outfitting center for Colorado gold <unk> headed for <unk> Peak in 1859 , 268 steamboats arrived at Omaha between March and November . \n With railroads becoming the dominant form of long @-@ range shipping and passenger travel in the early 1870s , <unk> like those in Omaha became obsolete . However , as late at 1949 the steamship Avalon was letting passengers in Omaha , before becoming one of the","outfitting center for Colorado gold <unk> headed for <unk> Peak in 1859 , 268 steamboats arrived at Omaha between March and November . \n With railroads becoming the dominant form of long @-@ range shipping and passenger travel in the early 1870s , <unk> like those in Omaha became obsolete . However , as late at 1949 the steamship Avalon was letting passengers in Omaha , before becoming one of the famous"
5,"of February 2016 , Vistara has a share of 2 % in the domestic carrier market . \n \n = = Corporate affairs = = \n \n In March 2015 , Vistara shifted to its new office at the One Horizon Center tower in Sector 43 , <unk> , a satellite city of Delhi . Vistara chose <unk> <unk> Yeoh as the chief executive officer ( CEO ) and <unk> Ming <unk>","February 2016 , Vistara has a share of 2 % in the domestic carrier market . \n \n = = Corporate affairs = = \n \n In March 2015 , Vistara shifted to its new office at the One Horizon Center tower in Sector 43 , <unk> , a satellite city of Delhi . Vistara chose <unk> <unk> Yeoh as the chief executive officer ( CEO ) and <unk> Ming <unk> as"
6,"the <unk> of the Indian National Congress and is still a major newspaper of Hindi northern India . \n \n = = = Art = = = \n \n Varanasi is a major centre of arts and designs . It is a producer of <unk> and <unk> with gold and silver thread work , carpet weaving , wooden toys , <unk> made of glass , ivory work , perfumes , artistic brass","<unk> of the Indian National Congress and is still a major newspaper of Hindi northern India . \n \n = = = Art = = = \n \n Varanasi is a major centre of arts and designs . It is a producer of <unk> and <unk> with gold and silver thread work , carpet weaving , wooden toys , <unk> made of glass , ivory work , perfumes , artistic brass and"
7,"of war under your charge to the State authorities , to be held subject to the action of the convention to be held on the 4th of March next . \n Perhaps because Abraham Lincoln had not yet been inaugurated as President , Captain Totten received no instructions from his superiors and was forced to withdraw his troops . He agreed to surrender the arsenal as long as the governor agreed to","war under your charge to the State authorities , to be held subject to the action of the convention to be held on the 4th of March next . \n Perhaps because Abraham Lincoln had not yet been inaugurated as President , Captain Totten received no instructions from his superiors and was forced to withdraw his troops . He agreed to surrender the arsenal as long as the governor agreed to three"
8,"the home . The decision to use a stay @-@ at @-@ home dad arrangement is most commonly due to economic reasons . At the same time , women are progressing into higher @-@ paying jobs . There are now financial ramifications in deciding whether the mother or father should become the stay @-@ at @-@ home parent . In cases where the woman is the higher @-@ paid parent , it","home . The decision to use a stay @-@ at @-@ home dad arrangement is most commonly due to economic reasons . At the same time , women are progressing into higher @-@ paying jobs . There are now financial ramifications in deciding whether the mother or father should become the stay @-@ at @-@ home parent . In cases where the woman is the higher @-@ paid parent , it makes"
9,"Allosaurus itself or at least the species A. fragillis , is technically a nomen dubium ( "" dubious name "" , based on a specimen too incomplete to compare to other specimens or to classify ) . In an attempt to fix this situation , Gregory S. Paul and Kenneth Carpenter ( 2010 ) submitted a petition to the <unk> to have the name A. fragillis officially transferred to the more complete","itself or at least the species A. fragillis , is technically a nomen dubium ( "" dubious name "" , based on a specimen too incomplete to compare to other specimens or to classify ) . In an attempt to fix this situation , Gregory S. Paul and Kenneth Carpenter ( 2010 ) submitted a petition to the <unk> to have the name A. fragillis officially transferred to the more complete specimen"


In [None]:
# Work on a copy so the library-level `awd_lstm_lm_config` is left untouched, then override
# the `*_p` entries (presumably dropout probabilities — confirm against `awd_lstm_lm_config`).
config = awd_lstm_lm_config.copy()
config.update(input_p=0.6, output_p=0.4, weight_p=0.5, embed_p=0.1, hidden_p=0.2)
# The model's vocabulary size is the size of the vocab built from the training texts.
model = get_language_model(AWD_LSTM, len(vocab), config=config)

In [None]:
# Optimizer factory: `Adam` with `wd=0.1` and `eps=1e-7` pre-bound.
opt_func = partial(Adam, wd=0.1, eps=1e-7)
# Callback factories handed to the `Learner`.
cb_funcs = [
    partial(MixedPrecision, clip=0.1),
    partial(RNNTrainer, alpha=3, beta=2),
]

In [None]:
# Assemble model, data, loss, optimizer factory, callbacks and metrics into a single `Learner`.
learn = Learner(
    model,
    dbch,
    loss_func=CrossEntropyLossFlat(),
    opt_func=opt_func,
    cb_funcs=cb_funcs,
    metrics=[accuracy, Perplexity()],
)

In [None]:
# Train for a single epoch via `fit_one_cycle`, run under IPython's `%prun` profiler.
# NOTE(review): 5e-3 is presumably the peak learning rate and `moms`/`div` shape the schedule — confirm against `fit_one_cycle`.
%prun learn.fit_one_cycle(1, 5e-3, moms=(0.8,0.7,0.8), div=10)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,5.557487,5.043018,0.244405,154.93689,01:52


 