# Source Code

In [1]:
# Notebook setup: (re)load the autoreload extension, reload edited modules before each
# cell runs (mode 2), and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2 
%matplotlib inline

In [None]:
! pip install sentencepiece

In [2]:
from fastai.text import *
import pandas as pd

In [3]:
import warnings
# Silence every Python warning for the rest of the session. This keeps cell output short,
# but it also hides library warnings that may be informative.
warnings.filterwarnings('ignore')

## Preparing the data

## Language model

Note that language models can use a lot of GPU memory, so you may need to decrease the batch size here.

In [4]:
# Batch size passed to the language-model DataBunch; lower it if the GPU runs out of memory.
bs = 128

Now let's grab the full dataset for what follows.

In [5]:
# Root folder of the Java dataset used for the language model; listing it is a quick check
# that the expected files and folders are present.
path = Path("/tf/data/datasets/raw/raw_java/data00m_god-r")
path.ls()

[PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/sm_test.csv'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/sm_valid.csv'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/test.csv'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/train.csv'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/data_clas.pkl'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/valid.csv'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/sm_train.csv'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/data_lm.pkl'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/tmp'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/sm_test'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/sm_train'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/train'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/models'),
 PosixPath('/tf/data/datasets/raw/raw_java/data00m_god-r/tmp.sh'),
 PosixPath('/tf/data/datas

In [None]:
(path/'train').ls()

The reviews are in a training and test set following an imagenet structure. The only difference is that there is an `unsup` folder on top of `train` and `test` that contains the unlabelled data.

We're not going to train a model that classifies the reviews from scratch. Like in computer vision, we'll use a model pretrained on a bigger dataset (a cleaned subset of wikipedia called [wikitext-103](https://einstein.ai/research/blog/the-wikitext-long-term-dependency-language-modeling-dataset)). That model has been trained to guess what the next word is, its input being all the previous words. It has a recurrent structure and a hidden state that is updated each time it sees a new word. This hidden state thus contains information about the sentence up to that point.

We are going to use that 'knowledge' of the English language to build our classifier, but first, like for computer vision, we need to fine-tune the pretrained model to our particular dataset. Because the English of the reviews left by people on IMDB isn't the same as the English of wikipedia, we'll need to adjust the parameters of our model by a little bit. Plus there might be some words that would be extremely common in the reviews dataset but would be barely present in wikipedia, and therefore might not be part of the vocabulary the model was trained on.

This is where the unlabelled data is going to be useful to us, as we can use it to fine-tune our model. Let's create our data object with the data block API (next line takes a few minutes).

In [6]:
# Pre-processing pipeline for the language-model data: open each file, then run it through
# SentencePiece using the model/vocab files given below. Both rule lists are passed empty.
processor = [
    OpenFileProcessor(),
    SPProcessor(
        lang="en",
        pre_rules=[],
        post_rules=[],
        max_sentence_len=30_720,
        sp_model="/tf/main/nbs/poc/model.model",
        sp_vocab="/tf/main/nbs/poc/model.vocab",
    ),
]

In [None]:
data_lm = (
    TextList.from_folder(
        path, extensions={".java"}, processor = processor
    )
    #Inputs: all the .java files found under path
    .filter_by_folder(include=['sm_train', 'sm_valid'])
    #path also contains other folders with files in them,
    #so we only keep what's in sm_train and sm_valid
    .split_by_folder(train='sm_train', valid='sm_valid')
    .label_for_lm() 
    #We want to do a language model so we label accordingly
    .databunch(bs=bs)
)
#Save the processed DataBunch so it can be reloaded later without re-running this cell
data_lm.save('data_lm.pkl')

In [None]:
# data_lm = (TextList.from_folder(path, extensions={".java"},
#                                 processor = [OpenFileProcessor(),
#                                              SPProcessor(lang="en", pre_rules=[], post_rules=[],
#                                                          max_sentence_len=40_960,
#                                                          sp_model = "/tf/src/prototypes/model.model",
#                                                          sp_vocab = "/tf/src/prototypes/model.vocab"
#                                                         )])
#            #Inputs: all the text files in path
#             .filter_by_folder(include=['sm_train', 'sm_valid']) 
# #            #We may have other temp folders that contain text files so we only keep what's in train and test
#             .split_by_folder(train='sm_train', valid='sm_valid')
#            #We randomly split and keep 10% (10,000 reviews) for validation
#             .label_for_lm()           
#            #We want to do a language model so we label accordingly
#             .databunch(bs=bs))
# data_lm.save('data_lm.pkl')

We have to use a special kind of `TextDataBunch` for the language model, that ignores the labels (that's why we put 0 everywhere), will shuffle the texts at each epoch before concatenating them all together (only for training, we don't shuffle for the validation set) and will send batches that read that text in order with targets that are the next word in the sentence.

Because the previous cell takes a while to run, we saved its result; the following cell quickly reloads the already-processed data.

In [None]:
# Reload the language-model DataBunch from the 'data_lm.pkl' file saved under `path`.
data_lm = load_data(path, 'data_lm.pkl', bs=bs)

In [None]:
len(data_lm.vocab.itos)

In [None]:
data_lm.show_batch()

We can then put this in a learner object very easily. When a model is created with pretrained weights, they are downloaded the first time you execute the line and stored in `~/.fastai/models/` (or elsewhere if you specified different paths in your config file). Note that the AWD_LSTM learner below is created with `pretrained=False`, so it starts from randomly initialised weights.

In [None]:
# AWD_LSTM language model created without pretrained weights (pretrained=False).
# NOTE(review): the following cell rebinds `learn` to a TransformerXL learner, so when both
# cells are run in order this learner is discarded — run only the one you intend to train.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3, pretrained=False)

In [None]:
# TransformerXL language model trained from scratch. fastai v1 has no pretrained weights for
# this architecture, so ask for an untrained model explicitly instead of relying on the
# default `pretrained=True` fallback (its warning is hidden by the global warnings filter).
learn = language_model_learner(data_lm, TransformerXL, drop_mult=0.3, pretrained=False)

In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot()

In [None]:
# One epoch of one-cycle training at a peak learning rate of 1.5e-2.
learn.fit_one_cycle(cyc_len=1, max_lr=1.5e-2, moms=(0.8, 0.75))

In [None]:
len(data_lm.train_ds)

In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot()

In [None]:
# One more epoch at a higher peak learning rate (5e-2) with a very short warm-up
# (pct_start=0.02).
learn.fit_one_cycle(cyc_len=1, max_lr=5e-2, moms=(0.8, 0.75), pct_start=0.02)

In [None]:
learn.recorder.plot_losses()

In [None]:
learn.save('fit_head')

In [None]:
learn.load('fit_head');

To complete the fine-tuning, we can then unfreeze the model and launch a new training run.

In [None]:
learn.unfreeze()

In [None]:
# Ten epochs on the unfrozen model at a peak learning rate of 5e-4.
learn.fit_one_cycle(cyc_len=10, max_lr=5e-4, moms=(0.8, 0.7), pct_start=0.02)

In [None]:
learn.unfreeze()

In [None]:
# A further ten-epoch cycle with the same hyper-parameters as the previous training cell.
learn.fit_one_cycle(cyc_len=10, max_lr=5e-4, moms=(0.8, 0.7), pct_start=0.02)

In [None]:
learn.save('fine_tuned')

In [None]:
# Notify Slack that training has finished.
# The webhook URL is a credential and must not live in the notebook: it is read from the
# SLACK_WEBHOOK_URL environment variable. Any webhook URL that was previously committed in
# this notebook should be treated as leaked and revoked.
import json
import os
import urllib.request

slack_webhook_url = os.environ.get("SLACK_WEBHOOK_URL")
if slack_webhook_url:
    slack_payload = {"text": "from: semeru tower 2\nstatus: finished training TransformerXL"}
    slack_request = urllib.request.Request(
        slack_webhook_url,
        data=json.dumps(slack_payload).encode("utf-8"),  # passing `data` makes this a POST
        headers={"Content-type": "application/json"},
    )
    with urllib.request.urlopen(slack_request, timeout=10) as slack_response:
        slack_response.read()
else:
    # Best-effort notification: a missing URL must not abort the notebook run.
    print("SLACK_WEBHOOK_URL is not set; skipping Slack notification.")

In [None]:
learn.recorder.plot_metrics()

How good is our model? Well let's try to see what it predicts after a few given words.

In [None]:
learn.load('fine_tuned');

In [None]:
# Prompt, number of tokens to generate, and number of samples for the generation demo below.
TEXT, N_WORDS, N_SENTENCES = "public String get", 40, 2

In [None]:
# Generate N_SENTENCES continuations of TEXT and print them one per line.
print(
    *(learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)),
    sep="\n",
)

We have to save not only the model, but also its encoder, the part that's responsible for creating and updating the hidden state. For the next part, we don't care about the part that tries to guess the next word.

In [None]:
learn.save_encoder('fine_tuned_enc')

## Classifier

Now, we'll create a new data object that only grabs the labelled data and keeps those labels. Again, this line takes a bit of time.

In [None]:
# The folder-based classifier data below is read from the Java dataset (`sm_train` /
# `sm_valid` folders, `.java` files). Downloading IMDB here would rebind `path` to a folder
# that contains none of them, so point `path` at the Java dataset instead.
path = Path("/tf/data/datasets/raw/raw_java/data00m_god-r")

In [None]:
data_clas = (
    TextList.from_folder(
        path, extensions={".java"}, vocab=data_lm.vocab,
        processor=[
            OpenFileProcessor(),
            # Use the same SentencePiece model/vocab files as the language-model processor so
            # that the classifier's token ids match the encoder that is loaded later.
            # NOTE(review): the language-model processor also passes pre_rules=[] and
            # post_rules=[]; confirm whether the classifier should do the same.
            SPProcessor(
                lang="en",
                sp_model="/tf/main/nbs/poc/model.model",
                sp_vocab="/tf/main/nbs/poc/model.vocab",
            ),
        ],
    )
    #of all the .java files under path, keep only those in sm_train and sm_valid
    .filter_by_folder(include=['sm_train', 'sm_valid'])
    #split by the sm_train and sm_valid folders
    .split_by_folder(train='sm_train', valid='sm_valid')
    #label them all with their folders, using the two classes listed here
    .label_from_folder(classes=['before', 'after'])
    .databunch(bs=bs)
)

#Save the processed DataBunch so it can be reloaded without re-running this cell
data_clas.save('data_clas.pkl')

In [None]:
path = Path("/tf/data/datasets/raw/security_c++")
path.ls()

In [None]:
# Batch size used for the classifier DataBunch built below.
bs = 64

In [None]:
# Interactive source lookup used while developing; it has no effect on later cells and can
# be removed from a finished notebook.
ItemList??

In [None]:
# `TextList.from_csv` selects the input column(s) with `cols` and takes the tokenization
# step through `processor`. `text_cols`, `label_cols` and `tokenizer` are not parameters of
# this method and are rejected as unexpected keyword arguments. Labels are not attached
# here; that is done with `label_from_df` when the full pipeline is built.
data_clas = TextList.from_csv(
    path, 'security-training.csv',
    cols='code',
    processor=SPProcessor(
        lang="en",
        sp_model="/tf/main/nbs/poc/model.model",
        sp_vocab="/tf/main/nbs/poc/model.vocab",
    ),
)

In [None]:
# Interactive docstring lookup used while developing; it has no effect on later cells and
# can be removed from a finished notebook.
TextList.from_csv?

In [None]:
data_clas = (
    TextList.from_csv(
        path, 'security-training.csv',
        # Only the source code is model input. Listing 'label' here as well would put the
        # target column into the text the classifier reads (label leakage).
        cols='code',
        processor=SPProcessor(
            lang="en",
            sp_model = "/tf/main/nbs/poc/model.model",
            sp_vocab = "/tf/main/nbs/poc/model.vocab"
        )
    )
    # Fixed seed so the random train/validation split is identical on every run.
    .split_by_rand_pct(seed=42)
    .label_from_df(cols=['label'])
    .databunch(bs=bs)
)
# NOTE(review): this rebinds the vocab attribute after the data has already been processed;
# it presumably does not re-numericalize anything, so the ids only line up with the language
# model if both were built from the same SentencePiece vocab file — confirm.
data_clas.vocab = data_lm.vocab

In [None]:
data_clas.show_batch()

In [None]:
data_clas.save('data_clas.pkl')

In [None]:
data_clas = load_data(path, 'data_clas.pkl', bs=bs)

In [None]:
data_clas.show_batch()

In [None]:
len(data_clas.train_ds)

In [None]:
path = Path("/tf/data/datasets/raw/raw_java/data00m_god-r")

We can then create a model to classify the code samples and load the encoder we saved before.

In [None]:
# The classifier must use the same architecture as the language model whose encoder is
# loaded here. The last language-model learner defined in this notebook is TransformerXL,
# and encoder weights saved from it cannot be loaded into an AWD_LSTM classifier.
learn = text_classifier_learner(data_clas, TransformerXL, drop_mult=0.5, pretrained=False)
# `path` points at the language-model dataset folder, where the encoder was saved.
learn.load_encoder(path/'models/fine_tuned_enc')

In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot()

In [None]:
# First classifier training pass: one epoch at a peak learning rate of 2e-1.
learn.fit_one_cycle(cyc_len=1, max_lr=2e-1, moms=(0.8, 0.7))

In [None]:
learn.recorder.plot_metrics()

In [None]:
learn.recorder.plot()

In [None]:
learn.save('first')

In [None]:
learn.load('first');

In [None]:
learn.freeze_to(-2)


In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot()

In [None]:
# One epoch with a learning-rate range: the upper bound is 1e-2 and the lower bound is that
# value divided by 2.6**4.
learn.fit_one_cycle(cyc_len=1, max_lr=slice(1e-2 / 2.6**4, 1e-2), moms=(0.8, 0.7))

In [None]:
learn.save('second')

In [None]:
learn.load('second');

In [None]:
# Move the freeze boundary to layer group -3, then train one epoch with a learning-rate
# range topping out at 5e-3 (lower bound: 5e-3 divided by 2.6**4).
learn.freeze_to(-3)
learn.fit_one_cycle(cyc_len=1, max_lr=slice(5e-3 / 2.6**4, 5e-3), moms=(0.8, 0.7))

In [None]:
learn.save('third')

In [None]:
learn.load('third');

In [None]:
# Unfreeze the whole model and train two epochs with a learning-rate range topping out at
# 1e-3 (lower bound: 1e-3 divided by 2.6**4).
learn.unfreeze()
learn.fit_one_cycle(cyc_len=2, max_lr=slice(1e-3 / 2.6**4, 1e-3), moms=(0.8, 0.7))

In [None]:
# Smoke-test the classifier on a small code snippet. The classifier is trained on the 'code'
# column of the security dataset, so a movie-review sentence is not a meaningful input.
learn.predict("void copy(char *src) { char buf[16]; strcpy(buf, src); }")