# ULMFiT on complete Cookie descriptions

With symbols and annotations.

In [None]:
import pandas as pd
from fastai.text import *

In [None]:
# Dataset directory; the CSV and the saved models live in the sibling 'models' folder.
pitt_path = Path('..') / 'data' / 'Pitt'
model_path = pitt_path.with_name('models')

In [None]:
# Load the dataset CSV (it is stored under model_path, not pitt_path)
# and preview its first rows.
pitt_df = pd.read_csv(model_path/'pitt-cookie-complete.csv')
pitt_df.head()

In [None]:
len(pitt_df)

## fastai API

In [None]:
# Batch size used by both the language-model and the classifier DataBunch.
# Previously 128.
bs = 64

To create the Language Model, we use all texts and keep 10% for validation.

In [None]:
# Language-model data: all rows of `pitt_df`, with a random 10% held out for
# validation. Set `cols` to 'clean' for the clean text, or to 'text' for the raw
# text with corrections and annotations.
data_lm = (
    TextList.from_df(pitt_df, path=model_path, cols='clean')
    .split_by_rand_pct(valid_pct=0.1)
    .label_for_lm()
    .databunch(bs=bs)
)

In [None]:
data_lm.show_batch()

In [None]:
torch.cuda.empty_cache()

In [None]:
# Language-model learner over data_lm using the AWD_LSTM architecture.
# NOTE(review): presumably starts from fastai's pretrained weights (library
# default) — confirm for the installed fastai version.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)

In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot()

In [None]:
# First training pass: one epoch with the one-cycle schedule, run before
# learn.unfreeze() is called further down.
# NOTE(review): lr was presumably read off the lr_find plot above — confirm.
lr = 1e-1
learn.fit_one_cycle(1, lr, moms=(0.8,0.7))

In [None]:
learn.save('cookie_fit_head')

In [None]:
learn = learn.load('cookie_fit_head')

In [None]:
learn.unfreeze()

In [None]:
# Fine-tune the unfrozen language model for 4 epochs at a tenth of `lr`.
learn.fit_one_cycle(4, lr*0.1, moms=(0.8,0.7))

We might be overfitting.

In [None]:
learn.save('cookie_fine_tuned')

In [None]:
learn.save_encoder('cookie_fine_tuned_enc')

In [None]:
learn.load_encoder('cookie_fine_tuned_enc')

## Classification

In [None]:
# Classification data: same text column as the language model and the same
# vocabulary (data_lm.vocab), labelled by the 'group' column, with a random 20%
# of the rows held out for validation.
data_clas = (
    TextList.from_df(pitt_df, cols='clean', path=model_path, vocab=data_lm.vocab)
    .split_by_rand_pct(0.2)
    .label_from_df(cols='group')
    .databunch(bs=bs)
)

In [None]:
data_clas.show_batch()

In [None]:
# Build the classifier (AWD_LSTM, drop_mult=0.5) and initialise its encoder from
# 'cookie_fine_tuned_enc', the encoder saved after language-model fine-tuning.
learn_clas = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn_clas.load_encoder('cookie_fine_tuned_enc')

In [None]:
learn_clas.lr_find()

In [None]:
learn_clas.recorder.plot(skip_end=10, suggestion=True)

In [None]:
torch.cuda.empty_cache()

In [None]:
# Base learning rate for the classifier training stages.
# NOTE: the trailing note used to be written C-style as `//2e-3`. Python parses
# `//` as floor division, so the expression evaluated to 2e-3 // 2e-3 == 1.0
# rather than 0.002.
lr_clas = 2e-3     # was: 2e-3

Save the best model (according to accuracy).
I tried monitoring 'valid_loss' but it gave me an error. Hopefully the two correlate.

In [None]:
from fastai.callbacks import *

In [None]:
# Train the classifier for 20 epochs with the one-cycle schedule at lr_clas.
# SaveModelCallback monitors 'accuracy' and stores its checkpoint under the
# name 'cookie_first_best'.
learn_clas.fit_one_cycle(20,
                         lr_clas, moms=(0.8, 0.7),
                         callbacks=[SaveModelCallback(learn_clas, monitor='accuracy', name='cookie_first_best')])


The last epochs have lower validation error. Let's use them for now.

In [None]:
learn_clas.save('cookie_first')

In [None]:
# Load best
#learn_clas = learn_clas.load('cookie_first_best')

In [None]:
# Drop the language-model learner; the remaining cells only use learn_clas.
del learn

In [None]:
# Force a garbage-collection pass now that `learn` has been deleted, then show
# the collector's current counts.
import gc
gc.collect()
gc.get_count()

In [None]:
torch.cuda.empty_cache()

### Freeze the stem and keep tuning (all epochs)

In [None]:
# NOTE(review): freeze_to(-2) presumably leaves only the last two layer groups
# trainable — confirm against the fastai docs.
# Train for 3 epochs with a learning-rate range from lr_clas/(2.6**4) to lr_clas.
# The result of this stage is not saved (the save below is commented out) and
# the 'cookie_first' checkpoint is reloaded afterwards.
learn_clas.freeze_to(-2)
learn_clas.fit_one_cycle(3, slice(lr_clas/(2.6**4),lr_clas), moms=(0.8,0.7))

In [None]:
#learn_clas.save('second')

In [None]:
# Load previous version!
# This restores the 'cookie_first' checkpoint, replacing the weights produced by
# the freeze_to(-2) stage above.
learn_clas = learn_clas.load('cookie_first')

In [None]:
# NOTE(review): freeze_to(-3) presumably leaves the last three layer groups
# trainable — confirm against the fastai docs.
# Train for 10 epochs at a tenth of lr_clas (range: lr_clas*0.1/(2.6**4) to
# lr_clas*0.1). SaveModelCallback monitors 'accuracy' and stores its checkpoint
# as 'cookie_third_best'.
learn_clas.freeze_to(-3)
learn_clas.fit_one_cycle(10,
                         slice(lr_clas*0.1/(2.6**4), lr_clas*0.1),
                         moms=(0.8,0.7),
                        callbacks=[SaveModelCallback(learn_clas, monitor='accuracy', name='cookie_third_best')]
)

In [None]:
learn_clas.save('cookie_third')

In [None]:
learn_clas = learn_clas.load('cookie_third_best')

### Unfreeze and fit a bit more

In [None]:
torch.cuda.empty_cache()

In [None]:
# Unfreeze the whole model and train for 10 more epochs at lr_clas/20
# (range: (lr_clas/20)/(2.6**4) to lr_clas/20). SaveModelCallback monitors
# 'accuracy' and stores its checkpoint as 'cookie_fourth_best'.
learn_clas.unfreeze()
learn_clas.fit_one_cycle(10,
                         slice((lr_clas/20)/(2.6**4), lr_clas/20),
                         moms=(0.8,0.7),
                         callbacks=[SaveModelCallback(learn_clas, monitor='accuracy', name='cookie_fourth_best')]
)

In [None]:
learn_clas.save('cookie_fourth')

In [None]:
learn_clas = learn_clas.load('cookie_fourth_best')

## ToDo: try with the `best` models in the steps above

I got accuracies of ~90% with bs=128, but it kept crashing on me in the last step. :(

## Prediction

In [None]:
learn_clas.predict(pitt_df.iloc[0].clean)

In [None]:
pitt_df.iloc[0].clean

In [None]:
import matplotlib.cm as cm

# Build an interpretation object from the trained classifier and display the
# intrinsic attention for the first row's clean text, using the Purples colormap.
txt_ci = TextClassificationInterpretation.from_learner(learn_clas)
txt_ci.show_intrinsic_attention(pitt_df.iloc[0].clean, cmap=cm.Purples)

In [None]:
txt_ci.show_intrinsic_attention(pitt_df.iloc[550].clean, cmap=cm.Purples)

## Export

In [None]:
learn_clas.model[0]

I suppose I have to export the `model` using `torch.onnx.export`, but I have no idea how to supply the inputs. I guess we need to:

* Export the language model vocab.
* Perform tokenization in the iPhone app using that vocab.
* Vectorize - how?


The input to the `forward` function of `MultiBatchEncoder` is a `LongTensor` (64-bit integer) of size `(batchSize, sentenceLength)`, I think.

Assume max. sentence length is 70*20 = 1400

In [None]:
from torch.autograd import Variable

# We need an input to go through the model in order to be able to export it.
# `torch.LongTensor(1, max_len)` only allocates memory without initialising it,
# so the dummy token ids were arbitrary 64-bit values (possibly negative or
# beyond the vocabulary size) and could break the embedding lookup. Build a
# deterministic tensor of a fixed id instead; the `Variable` wrapper is no
# longer needed for plain tensors.
# NOTE(review): id 0 is assumed to be a valid vocabulary index — confirm
# against data_lm.vocab.
max_len = 70*20
t = torch.zeros(1, max_len, dtype=torch.long)

In [None]:
# Names passed to torch.onnx.export for the graph's input and output.
input_names = ["sentence"]
output_names = ["prediction"]

In [None]:
# Run conversion in CPU
# The whole model object is pickled to 'ulmfit.model' in the working directory
# and read back with map_location='cpu', so `model` ends up as a separate copy
# with its tensors mapped to the CPU.
model = learn_clas.model
torch.save(model, 'ulmfit.model')
model = torch.load('ulmfit.model', map_location='cpu')

In [None]:
# Export the CPU copy of the model to 'ulmfit.onnx', using the dummy input `t`
# and the input/output names defined above.
# NOTE(review): the model is not explicitly switched to eval mode before the
# export — confirm that the exported graph is in the intended mode.
torch.onnx.export(model, t, "ulmfit.onnx", verbose=True, input_names=input_names, output_names=output_names)