# NLP with Fastai

In [1]:
from fastai2.text.all import *

In [2]:
# Fetch the IMDB dataset via fastai's `untar_data`; `path` is the dataset directory.
path = untar_data(URLs.IMDB)

In [3]:
# List the top-level entries of the dataset directory.
path.ls()

(#7) [Path('/home/jupyter/.fastai/data/imdb/README'),Path('/home/jupyter/.fastai/data/imdb/imdb.vocab'),Path('/home/jupyter/.fastai/data/imdb/unsup'),Path('/home/jupyter/.fastai/data/imdb/train'),Path('/home/jupyter/.fastai/data/imdb/tmp_lm'),Path('/home/jupyter/.fastai/data/imdb/tmp_clas'),Path('/home/jupyter/.fastai/data/imdb/test')]

In [4]:
# Gather the text files found under the train, test and unsup folders.
files = get_text_files(path, folders=['train', 'test', 'unsup'])

In [5]:
# Read the first review as the running sample document. `read_text` opens and
# closes the file itself, so no file handle is left dangling.
txt = files[0].read_text()
# Peek at the first 80 characters.
txt[:80]

"Am not from America, I usually watch this show on AXN channel, I don't know why "

In [6]:
# Word-level tokenization. The tokenizer is called on a collection of
# documents, so wrap the single sample in a list and keep the first result.
tokenizer = WordTokenizer()
tokens = first(tokenizer([txt]))
# Show only the first 40 tokens to keep the output short.
print(coll_repr(tokens, 40))

(#215) ['Am','not','from','America',',','I','usually','watch','this','show','on','AXN','channel',',','I','do',"n't",'know','why','this','respected','channel','air','such','sucking','program','in','prime','time','slot','.','Creation','of','Hollywood',"'s",'Money','Bank','Jerry','Bruckheimer',','...]


In [7]:
# Development aid: display the source of `first`.
# NOTE(review): consider removing this introspection cell before sharing the notebook.
first??

In [8]:
# Raw string: `\s` is not a valid escape sequence, so a normal string literal
# triggers an invalid-escape warning. The text passed to the tokenizer is
# unchanged (a literal backslash followed by `s`).
# NOTE(review): the input was probably meant to read "Gravity's" — confirm.
first(tokenizer([r'Gravity\s Rainbow costs $10.99.']))

(#6) ['Gravity\\s','Rainbow','costs','$','10.99','.']

In [9]:
# Wrap the word tokenizer in fastai's `Tokenizer`. As the output shows, the
# result is lowercased and carries special `xx*` tokens (xxbos, xxmaj, xxup).
tkn = Tokenizer(tokenizer)
print(coll_repr(tkn(txt),40))

(#240) ['xxbos','xxmaj','am','not','from','xxmaj','america',',','i','usually','watch','this','show','on','xxup','axn','channel',',','i','do',"n't",'know','why','this','respected','channel','air','such','sucking','program','in','prime','time','slot','.','xxmaj','creation','of','xxmaj','hollywood'...]


In [10]:
# Default text-processing rules registered on fastai's `defaults` object.
defaults.text_proc_rules

[<function fastai2.text.core.fix_html(x)>,
 <function fastai2.text.core.replace_rep(t)>,
 <function fastai2.text.core.replace_wrep(t)>,
 <function fastai2.text.core.spec_add_spaces(t)>,
 <function fastai2.text.core.rm_useless_spaces(t)>,
 <function fastai2.text.core.replace_all_caps(t)>,
 <function fastai2.text.core.replace_maj(t)>,
 <function fastai2.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [11]:
# Development aid: display the source of `L`.
# NOTE(review): consider removing this introspection cell before sharing the notebook.
L??

In [12]:
# Load the first 2,000 reviews into memory. `read_text` opens and closes each
# file, so the loop does not leave 2,000 file handles open.
txts = L(f.read_text() for f in files[:2000])

### Subword Tokenization

In [13]:
def subwords(sz):
    """Preview subword tokenization of the sample text for a vocab of size `sz`.

    Fits a `SubwordTokenizer` on the notebook-level `txts` corpus, tokenizes
    the notebook-level `txt` sample, and returns its first 40 subword tokens
    joined by single spaces.
    """
    subword_tokenizer = SubwordTokenizer(vocab_sz=sz)
    subword_tokenizer.setup(txts)
    sample_tokens = first(subword_tokenizer([txt]))
    return ' '.join(sample_tokens[:40])

In [14]:
!pip install sentencepiece!=0.1.90,!=0.1.91



In [15]:
# Preview the sample text tokenized with a 1,000-entry subword vocab.
subwords(1000)

"▁A m ▁not ▁from ▁A mer ic a , ▁I ▁usual ly ▁watch ▁this ▁show ▁on ▁A X N ▁ch an n el , ▁I ▁don ' t ▁know ▁why ▁this ▁re s pe c t ed ▁ch an n"

In [16]:
# With a smaller vocab, each token represents fewer characters,
# so it takes more tokens to represent a sentence:
subwords(200)

'▁A m ▁ n o t ▁f ro m ▁A m er ic a , ▁I ▁ u s u al ly ▁w a t ch ▁this ▁sh ow ▁on ▁A X N ▁ ch an ne l , ▁I'

In [17]:
# With a larger vocab, most common English words end up in the vocab themselves,
# so fewer tokens are needed to represent a sentence:
subwords(10000)

"▁Am ▁not ▁from ▁America , ▁I ▁usual ly ▁watch ▁this ▁show ▁on ▁ AX N ▁channel , ▁I ▁don ' t ▁know ▁why ▁this ▁respected ▁channel ▁air ▁such ▁suck ing ▁program ▁in ▁prime ▁time ▁slot . ▁C re ation ▁of"

### Numericalization

In [18]:
# Tokenize the sample once and reuse the result for the preview
# (the original ran the tokenizer a second time just to print it).
toks = tkn(txt)
print(coll_repr(toks, 31))

(#240) ['xxbos','xxmaj','am','not','from','xxmaj','america',',','i','usually','watch','this','show','on','xxup','axn','channel',',','i','do',"n't",'know','why','this','respected','channel','air','such','sucking','program','in'...]


In [19]:
# Tokenize the first 200 reviews and show the first tokenized review.
toks200 = txts[:200].map(tkn)
toks200[0]

(#240) ['xxbos','xxmaj','am','not','from','xxmaj','america',',','i','usually'...]

In [20]:
# Set up the numericalizer on the 200 tokenized reviews, then preview the
# first 20 entries of the resulting vocab.
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab,20)

"(#2072) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','a','and','of','to','is','in','it','i'...]"

In [21]:
# Convert the first 20 tokens of the sample review to their vocab indices.
nums = num(toks[:20])
nums

TensorText([   2,    8,  233,   34,   51,    8,  716,   11,   19,  818,  117,   20,
         116,   32,    7,    0, 1519,   11,   19,   65])

In [22]:
# Map the indices back to tokens via the vocab. Index 0 decodes to `xxunk`,
# which is what the out-of-vocab token 'axn' became.
' '.join(num.vocab[i] for i in nums)

'xxbos xxmaj am not from xxmaj america , i usually watch this show on xxup xxunk channel , i do'

In [23]:
# Numericalize each of the 200 tokenized reviews.
nums200 = toks200.map(num)

In [24]:
# Wrap the numericalized reviews in a language-model data loader.
data_loader = LMDataLoader(nums200)

In [25]:
# Grab the first batch and compare the shapes of the inputs (x) and targets (y).
x,y = first(data_loader)
x.shape, y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [27]:
# Decode the first 20 tokens of the first input sequence.
' '.join(num.vocab[o] for o in x[0][:20])

'xxbos xxmaj am not from xxmaj america , i usually watch this show on xxup xxunk channel , i do'

In [28]:
# Decode the first 20 tokens of the matching target sequence; compare with the
# input above to see that it is the same text shifted one token ahead.
' '.join(num.vocab[o] for o in y[0][:20])

"xxmaj am not from xxmaj america , i usually watch this show on xxup xxunk channel , i do n't"

## Training a Text Classifier

In [29]:
# File getter restricted to the train, test and unsup folders.
get_imdb = partial(get_text_files, folders=['train','test','unsup'])

# Build the language-model DataLoaders: a text block created with `is_lm=True`,
# items collected by `get_imdb`, and a random split.
# NOTE(review): RandomSplitter(0.1) presumably holds out 10% for validation — confirm.
dls_lm = DataBlock(
            blocks=TextBlock.from_folder(path, is_lm=True),
            get_items = get_imdb,
            splitter = RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_len=80)

In [30]:
# Preview two samples; in the output, `text_` is `text` shifted by one token.
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos the film was overly sentimental and over dramatic . xxmaj moreover , in spite of the lavish attention to period baseball with the appearance of xxup cgi stadiums along with period uniforms and gloves , it seems to me that they failed to understand a simple fact of baseball . xxmaj in the final game of the xxmaj world xxmaj series , played in xxmaj detroit , the main character , xxmaj joe , runs out on the field","the film was overly sentimental and over dramatic . xxmaj moreover , in spite of the lavish attention to period baseball with the appearance of xxup cgi stadiums along with period uniforms and gloves , it seems to me that they failed to understand a simple fact of baseball . xxmaj in the final game of the xxmaj world xxmaj series , played in xxmaj detroit , the main character , xxmaj joe , runs out on the field to"
1,"remember taking a couple of special friends on a date to see the movie and them being as moved and teary - eyed at the end as i was . xxmaj i 'm both anxious and nervous to find a copy and see it now . xxmaj so many movies which seemed so important to me back then ( i.e. "" the xxmaj graduate "" "" easy xxmaj rider "" ) now just seem silly and i do n't want","taking a couple of special friends on a date to see the movie and them being as moved and teary - eyed at the end as i was . xxmaj i 'm both anxious and nervous to find a copy and see it now . xxmaj so many movies which seemed so important to me back then ( i.e. "" the xxmaj graduate "" "" easy xxmaj rider "" ) now just seem silly and i do n't want this"


### Fine-Tuning the Language Model

In [33]:
# Language-model learner over `dls_lm` using the AWD_LSTM architecture,
# reporting accuracy and perplexity as metrics.
# NOTE(review): drop_mult presumably scales the model's dropout rates — confirm.
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3,
    metrics=[accuracy, Perplexity()]
)

In [None]:
# Train for one epoch with `fit_one_cycle`, passing 2e-2 as the learning rate.
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
