In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import torch
from fastai import *
from fastai.text import *

In [2]:
# bs=48
# bs=24
bs=128

In [3]:
torch.cuda.set_device(0) # Device: 1080Ti

In [4]:
data_path = Config.data_path()

In [23]:
name = 'viwiki'
path = Path('/home/dle/vi-lm')
lm_fns = ['vi_wt', 'vi_wt_vocab']

## Vietnamese wikipedia model

### Create pretrained model

In [9]:
dest = path/'viwiki'

In [10]:
data = (TextList.from_folder(dest)
            .split_by_rand_pct(0.1, seed=42)
            .label_for_lm()           
            .databunch(bs=bs, num_workers=1))

data.save('vi_databunch')
len(data.vocab.itos),len(data.train_ds)

(60000, 64305)

In [None]:
data = load_data(path, 'vi_databunch', bs=bs)

In [11]:
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False).to_fp16()

In [12]:
lr = 1e-2
lr *= bs/48  # Scale learning rate by batch size

In [13]:
learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.48749,3.520983,0.360647,1:29:59
1,3.566643,3.583175,0.352018,1:29:30
2,3.574274,3.65013,0.344561,1:29:14
3,3.617446,3.621188,0.347692,1:29:04
4,3.552341,3.59358,0.349983,1:29:06
5,3.451556,3.47802,0.363552,1:28:54
6,3.449163,3.388254,0.373088,1:29:08
7,3.345643,3.296472,0.383724,1:29:17
8,3.295935,3.215711,0.394262,1:30:01
9,3.307131,3.178168,0.399416,1:30:25


In [21]:
mdl_path/lm_fns[1]

PosixPath('models/vi_wt_vocab')

Save the pretrained model and vocab:

In [24]:
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
learn.to_fp32().save(mdl_path/lm_fns[0], with_opt=False)
learn.data.vocab.save(mdl_path/(lm_fns[1] + '.pkl'))

## Vietnamese sentiment analysis

### Fine-tune language model

In [None]:
train_df = pd.read_csv(path/'train.csv')
train_df.loc[pd.isna(train_df.comment),'comment']='NA'
train_df.head()

In [None]:
test_df = pd.read_csv(path/'test.csv')
test_df.loc[pd.isna(test_df.comment),'comment']='NA'
test_df.head()

In [None]:
df = pd.concat([train_df,test_df], sort=False)

In [None]:
data_lm = (TextList.from_df(df, path, cols='comment')
    .split_by_rand_pct(0.1, seed=42)
    .label_for_lm()           
    .databunch(bs=bs, num_workers=1))

In [None]:
learn_lm = language_model_learner(data_lm, AWD_LSTM, pretrained_fnames=lm_fns, drop_mult=1.0)

In [None]:
lr = 1e-3
lr *= bs/48

In [None]:
learn_lm.fit_one_cycle(2, lr*10, moms=(0.8,0.7))

In [None]:
learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8,0.7))

In [None]:
learn_lm.save(f'{lang}fine_tuned')
learn_lm.save_encoder(f'{lang}fine_tuned_enc')

### Classifier

In [None]:
data_clas = (TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
    .split_by_rand_pct(0.1, seed=42)
    .label_from_df(cols='label')
    .databunch(bs=bs, num_workers=1))

data_clas.save(f'{lang}_textlist_class')

In [None]:
data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)

In [None]:
from sklearn.metrics import f1_score

@np_func
def f1(inp,targ): return f1_score(targ, np.argmax(inp, axis=-1))

In [None]:
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c.load_encoder(f'{lang}fine_tuned_enc')
learn_c.freeze()

In [None]:
lr=2e-2
lr *= bs/48

In [None]:
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

In [None]:
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

In [None]:
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7))

In [None]:
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(2, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))

In [None]:
learn_c.unfreeze()
learn_c.fit_one_cycle(1, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))

In [None]:
learn_c.save('vi_clas')