In [1]:
import numpy as np
import pandas as pd
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd
import mxnet as mx
import time
import itertools
import random

In [2]:
ctx = mx.gpu()
ctx

gpu(0)

In [3]:
bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
                 dataset_name='wiki_multilingual_cased',
                 pretrained=True, ctx=ctx, use_pooler=True,
                 use_decoder=False, use_classifier=False)

class BERTDataset(mx.gluon.data.Dataset):
    def __init__(self, dataset, sent_idx, label_idx, 
                 bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, 
            pad=pad, pair=pair)
        sent_dataset = gluon.data.SimpleDataset(
            [[i[sent_idx],] for i in dataset]
        )
        self.sentences = sent_dataset.transform(transform)
        self.labels = gluon.data.SimpleDataset(
            [np.array(np.int32(i[label_idx])) for i in dataset])
    
    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
    
    def __len__(self):
        return len(self.labels)

bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=False)
max_len = 64

class BERTClassifier(nn.Block):
    def __init__(self,
                 bert,
                 num_classes=2,
                 dropout=None,
                 prefix=None,
                 params=None):
        super(BERTClassifier, self).__init__(prefix=prefix, params=params)
        self.bert = bert
        with self.name_scope():
            self.classifier = nn.HybridSequential(prefix=prefix)
            if dropout:
                self.classifier.add(nn.Dropout(rate=dropout))
            self.classifier.add(nn.Dense(units=num_classes))
            
    def forward(self, inputs, token_types, valid_length=None):
        _, pooler = self.bert(inputs, token_types, valid_length)
        return self.classifier(pooler)

In [4]:
import sys
mod = sys.modules[__name__]

In [8]:
i = 2
setattr(mod, 'dataset_train{}'.format(i), 
        nlp.data.TSVDataset(
    "./smishing_text/train_smishing{}.txt".format(i), 
    field_indices=[1,2], num_discard_samples=1))
setattr(mod, 'dataset_valid{}'.format(i),
        nlp.data.TSVDataset(
    "./smishing_text/valid_smishing{}.txt".format(i), 
    field_indices=[1,2], num_discard_samples=1))
    

setattr(mod, 'data_train{}'.format(i), 
        BERTDataset(
            getattr(mod, 'dataset_train{}'.format(i)), 
            0, 1, bert_tokenizer, max_len, True, False))
setattr(mod, 'data_valid{}'.format(i),
       BERTDataset(
           getattr(mod, 'dataset_valid{}'.format(i)), 
           0, 1, bert_tokenizer, max_len, True, False))
    

model_ = 'model{}'.format(i)
setattr(mod, model_,
        BERTClassifier(bert_base, num_classes=2, dropout=0.3))

# 분류 레이어만 초기화 한다. 
getattr(mod, model_).classifier.initialize(ctx=ctx)
getattr(mod, model_).hybridize()

# softmax cross entropy loss for classification
loss_function = gluon.loss.SoftmaxCELoss()

metric = mx.metric.Accuracy()

batch_size = 64
lr = 5e-5


setattr(mod, 'train_dataloader{}'.format(i),
        mx.gluon.data.DataLoader(
            getattr(mod, 'data_train{}'.format(i)), 
            batch_size=batch_size, num_workers=0))
setattr(mod, 'valid_dataloader{}'.format(i),
        mx.gluon.data.DataLoader(
            getattr(mod, 'data_valid{}'.format(i)), 
            batch_size=batch_size, num_workers=0))
    
log_interval = 4
num_epochs = 5


model_ = getattr(mod, 'model{}'.format(i))
setattr(mod, 'trainer{}'.format(i), gluon.Trainer(
    model_.collect_params(), 
    'bertadam',
    {'learning_rate': lr, 
     'epsilon': 1e-9, 
     'wd':0.01}))
# LayerNorm과 Bias에는 Weight Decay를 적용하지 않는다. 
iters = model_.collect_params('.*beta|.*gamma|.*bias').items()
for _, v in iters:
    v.wd_mult = 0.0
        

model_ = getattr(mod, 'model{}'.format(i))
setattr(mod, 'params{}'.format(i),
       [p for p in model_.collect_params().values() 
          if p.grad_req != 'null']) 

In [9]:
# Learning rate warmup을 위한 준비 
step_size = batch_size 
num_train_examples = len(getattr(mod, 'data_train{}'.format(i)))
num_train_steps = int(num_train_examples / step_size * num_epochs)
warmup_ratio = 0.1
num_warmup_steps = int(num_train_steps * warmup_ratio)
step_num = 0

In [10]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    acc = mx.metric.Accuracy()
    ix = 0
    for ix, (t,v,s, label) in enumerate(data_iter):
        token_ids = t.as_in_context(ctx)
        valid_length = v.as_in_context(ctx)
        segment_ids = s.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = model(token_ids, segment_ids, valid_length.astype('float32'))
        acc.update(preds=output, labels=label)
        if ix > 1000:
            break
        ix += 1
    return(acc.get()[1])

In [11]:
print('{}번째 모델'.format(i))
model = getattr(mod, 'model{}'.format(i))
trainer = getattr(mod, 'trainer{}'.format(i))
train_dataloader = getattr(mod, 'train_dataloader{}'.format(i))
valid_dataloader = getattr(mod, 'valid_dataloader{}'.format(i))
params = getattr(mod, 'params{}'.format(i))
for epoch_id in range(num_epochs):
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, 
                   valid_length, 
                   segment_ids, 
                   label) in enumerate(train_dataloader):
        step_num += 1
        if step_num < num_warmup_steps:
            new_lr = lr * step_num / num_warmup_steps
        else:
            offset = (step_num - num_warmup_steps) * lr / (
                num_train_steps - num_warmup_steps)
            new_lr = lr - offset
        trainer.set_learning_rate(new_lr)
        with mx.autograd.record():
            # load data to GPU
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            label = label.as_in_context(ctx)

            # forward computation
            out = model(token_ids, segment_ids, 
                        valid_length.astype('float32'))
            ls = loss_function(out, label).mean()

        # backward computation
        ls.backward()
        trainer.allreduce_grads()
        nlp.utils.clip_grad_global_norm(params, 1)
        trainer.update(token_ids.shape[0])

        step_loss += ls.asscalar()
        metric.update([label], [out])
        if (batch_id + 1) % (50) == 0:
            print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.10f}, acc={:.3f}'
                         .format(epoch_id + 1, batch_id + 1, 
                                 len(train_dataloader),
                                 step_loss / log_interval,
                                 trainer.learning_rate, 
                                 metric.get()[1]))
            step_loss = 0
    valid_acc = evaluate_accuracy(model, valid_dataloader, ctx)
    print('Valid acc : {}\n'.format(valid_acc))

2번째 모델
[Epoch 1 Batch 50/482] loss=3.5441, lr=0.0000104167, acc=0.875
[Epoch 1 Batch 100/482] loss=1.0443, lr=0.0000208333, acc=0.925
[Epoch 1 Batch 150/482] loss=1.1543, lr=0.0000312500, acc=0.943
[Epoch 1 Batch 200/482] loss=0.7825, lr=0.0000416667, acc=0.953
[Epoch 1 Batch 250/482] loss=1.0256, lr=0.0000497692, acc=0.958
[Epoch 1 Batch 300/482] loss=0.6390, lr=0.0000486150, acc=0.963
[Epoch 1 Batch 350/482] loss=0.7012, lr=0.0000474608, acc=0.966
[Epoch 1 Batch 400/482] loss=1.0592, lr=0.0000463066, acc=0.969
[Epoch 1 Batch 450/482] loss=0.8977, lr=0.0000451524, acc=0.970
Valid acc : 0.9880550506361984

[Epoch 2 Batch 50/482] loss=0.7283, lr=0.0000432595, acc=0.989
[Epoch 2 Batch 100/482] loss=0.3041, lr=0.0000421053, acc=0.993
[Epoch 2 Batch 150/482] loss=0.2937, lr=0.0000409511, acc=0.994
[Epoch 2 Batch 200/482] loss=0.3019, lr=0.0000397969, acc=0.994
[Epoch 2 Batch 250/482] loss=0.3084, lr=0.0000386427, acc=0.994
[Epoch 2 Batch 300/482] loss=0.3414, lr=0.0000374885, acc=0.994
[Ep

In [12]:
getattr(mod, 'model{}'.format(i)).save_parameters('model{}.params'.format(i))

In [None]:
# new_net = BERTClassifier(bert_base, num_classes=2, dropout=0.3)
# new_net.load_parameters('model2.params', ctx=ctx)