In [1]:
import numpy as np
import pandas as pd
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd
import mxnet as mx
import time
import itertools
import random

In [2]:
ctx = mx.gpu()
ctx

gpu(0)

In [3]:
bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
                 dataset_name='wiki_multilingual_cased',
                 pretrained=True, ctx=ctx, use_pooler=True,
                 use_decoder=False, use_classifier=False)

class BERTDataset(mx.gluon.data.Dataset):
    def __init__(self, dataset, sent_idx, label_idx, 
                 bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, 
            pad=pad, pair=pair)
        sent_dataset = gluon.data.SimpleDataset(
            [[i[sent_idx],] for i in dataset]
        )
        self.sentences = sent_dataset.transform(transform)
        self.labels = gluon.data.SimpleDataset(
            [np.array(np.int32(i[label_idx])) for i in dataset])
    
    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
    
    def __len__(self):
        return len(self.labels)

bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=False)
max_len = 64

class BERTClassifier(nn.Block):
    def __init__(self,
                 bert,
                 num_classes=2,
                 dropout=None,
                 prefix=None,
                 params=None):
        super(BERTClassifier, self).__init__(prefix=prefix, params=params)
        self.bert = bert
        with self.name_scope():
            self.classifier = nn.HybridSequential(prefix=prefix)
            if dropout:
                self.classifier.add(nn.Dropout(rate=dropout))
            self.classifier.add(nn.Dense(units=num_classes))
            
    def forward(self, inputs, token_types, valid_length=None):
        _, pooler = self.bert(inputs, token_types, valid_length)
        return self.classifier(pooler)

In [4]:
import sys
mod = sys.modules[__name__]

In [5]:
i = 1
setattr(mod, 'dataset_train{}'.format(i), 
        nlp.data.TSVDataset(
    "./smishing_text/train_smishing{}.txt".format(i), 
    field_indices=[1,2], num_discard_samples=1))
setattr(mod, 'dataset_valid{}'.format(i),
        nlp.data.TSVDataset(
    "./smishing_text/valid_smishing{}.txt".format(i), 
    field_indices=[1,2], num_discard_samples=1))
    

setattr(mod, 'data_train{}'.format(i), 
        BERTDataset(
            getattr(mod, 'dataset_train{}'.format(i)), 
            0, 1, bert_tokenizer, max_len, True, False))
setattr(mod, 'data_valid{}'.format(i),
       BERTDataset(
           getattr(mod, 'dataset_valid{}'.format(i)), 
           0, 1, bert_tokenizer, max_len, True, False))
    

model_ = 'model{}'.format(i)
setattr(mod, model_,
        BERTClassifier(bert_base, num_classes=2, dropout=0.3))

# 분류 레이어만 초기화 한다. 
getattr(mod, model_).classifier.initialize(ctx=ctx)
getattr(mod, model_).hybridize()

# softmax cross entropy loss for classification
loss_function = gluon.loss.SoftmaxCELoss()

metric = mx.metric.Accuracy()

batch_size = 64
lr = 5e-5


setattr(mod, 'train_dataloader{}'.format(i),
        mx.gluon.data.DataLoader(
            getattr(mod, 'data_train{}'.format(i)), 
            batch_size=batch_size, num_workers=0))
setattr(mod, 'valid_dataloader{}'.format(i),
        mx.gluon.data.DataLoader(
            getattr(mod, 'data_valid{}'.format(i)), 
            batch_size=batch_size, num_workers=0))
    
log_interval = 4
num_epochs = 5


model_ = getattr(mod, 'model{}'.format(i))
setattr(mod, 'trainer{}'.format(i), gluon.Trainer(
    model_.collect_params(), 
    'bertadam',
    {'learning_rate': lr, 
     'epsilon': 1e-9, 
     'wd':0.01}))
# LayerNorm과 Bias에는 Weight Decay를 적용하지 않는다. 
iters = model_.collect_params('.*beta|.*gamma|.*bias').items()
for _, v in iters:
    v.wd_mult = 0.0
        

model_ = getattr(mod, 'model{}'.format(i))
setattr(mod, 'params{}'.format(i),
       [p for p in model_.collect_params().values() 
          if p.grad_req != 'null']) 

In [6]:
# Learning rate warmup을 위한 준비 
step_size = batch_size 
num_train_examples = len(getattr(mod, 'data_train{}'.format(i)))
num_train_steps = int(num_train_examples / step_size * num_epochs)
warmup_ratio = 0.1
num_warmup_steps = int(num_train_steps * warmup_ratio)
step_num = 0

In [7]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    acc = mx.metric.Accuracy()
    ix = 0
    for ix, (t,v,s, label) in enumerate(data_iter):
        token_ids = t.as_in_context(ctx)
        valid_length = v.as_in_context(ctx)
        segment_ids = s.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = model(token_ids, segment_ids, valid_length.astype('float32'))
        acc.update(preds=output, labels=label)
        if ix > 1000:
            break
        ix += 1
    return(acc.get()[1])

In [8]:
print('{}번째 모델'.format(i))
model = getattr(mod, 'model{}'.format(i))
trainer = getattr(mod, 'trainer{}'.format(i))
train_dataloader = getattr(mod, 'train_dataloader{}'.format(i))
valid_dataloader = getattr(mod, 'valid_dataloader{}'.format(i))
params = getattr(mod, 'params{}'.format(i))
for epoch_id in range(num_epochs):
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, 
                   valid_length, 
                   segment_ids, 
                   label) in enumerate(train_dataloader):
        step_num += 1
        if step_num < num_warmup_steps:
            new_lr = lr * step_num / num_warmup_steps
        else:
            offset = (step_num - num_warmup_steps) * lr / (
                num_train_steps - num_warmup_steps)
            new_lr = lr - offset
        trainer.set_learning_rate(new_lr)
        with mx.autograd.record():
            # load data to GPU
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            label = label.as_in_context(ctx)

            # forward computation
            out = model(token_ids, segment_ids, 
                        valid_length.astype('float32'))
            ls = loss_function(out, label).mean()

        # backward computation
        ls.backward()
        trainer.allreduce_grads()
        nlp.utils.clip_grad_global_norm(params, 1)
        trainer.update(token_ids.shape[0])

        step_loss += ls.asscalar()
        metric.update([label], [out])
        if (batch_id + 1) % (50) == 0:
            print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.10f}, acc={:.3f}'
                         .format(epoch_id + 1, batch_id + 1, 
                                 len(train_dataloader),
                                 step_loss / log_interval,
                                 trainer.learning_rate, 
                                 metric.get()[1]))
            step_loss = 0
    valid_acc = evaluate_accuracy(model, valid_dataloader, ctx)
    print('Valid acc : {}\n'.format(valid_acc))

1번째 모델
[Epoch 1 Batch 50/482] loss=3.5184, lr=0.0000104167, acc=0.859
[Epoch 1 Batch 100/482] loss=2.5072, lr=0.0000208333, acc=0.902
[Epoch 1 Batch 150/482] loss=0.5082, lr=0.0000312500, acc=0.931
[Epoch 1 Batch 200/482] loss=1.4466, lr=0.0000416667, acc=0.940
[Epoch 1 Batch 250/482] loss=1.4656, lr=0.0000497692, acc=0.947
[Epoch 1 Batch 300/482] loss=0.6288, lr=0.0000486150, acc=0.954
[Epoch 1 Batch 350/482] loss=0.8557, lr=0.0000474608, acc=0.959
[Epoch 1 Batch 400/482] loss=1.1062, lr=0.0000463066, acc=0.961
[Epoch 1 Batch 450/482] loss=0.4469, lr=0.0000451524, acc=0.965
Valid acc : 0.9957153985977668

[Epoch 2 Batch 50/482] loss=0.2195, lr=0.0000432595, acc=0.997
[Epoch 2 Batch 100/482] loss=0.2120, lr=0.0000421053, acc=0.997
[Epoch 2 Batch 150/482] loss=0.2077, lr=0.0000409511, acc=0.997
[Epoch 2 Batch 200/482] loss=0.2402, lr=0.0000397969, acc=0.996
[Epoch 2 Batch 250/482] loss=0.1404, lr=0.0000386427, acc=0.997
[Epoch 2 Batch 300/482] loss=0.1381, lr=0.0000374885, acc=0.997
[Ep

### construct dataloader

In [8]:
dataset_valid = nlp.data.TSVDataset(
    "./smishing_text/train_smishing1.txt".format(i), 
    field_indices=[1,2], num_discard_samples=1)

data_valid = BERTDataset(dataset_valid,
        0, 1, bert_tokenizer, max_len, True, False)

valid_dataloader2 = mx.gluon.data.DataLoader(
        data_valid, batch_size=64, num_workers=0)

In [10]:
# model = nets[1]
acc = mx.metric.Accuracy()
result = []
for ix, (t, v, s, label) in enumerate(valid_dataloader2):
    if ix % 10 == 0:
        print('\r{}'.format(ix), end='')
    token_ids = t.as_in_context(ctx)
    valid_length = v.as_in_context(ctx)
    segment_ids = s.as_in_context(ctx)
    label = label.as_in_context(ctx)
    output = model(token_ids, segment_ids, 
                   valid_length.astype('float32'))
    result.append([output, label])
    acc.update(preds=output, labels=label)

480

In [11]:
acc.get()

('accuracy', 0.9999350733670952)

In [19]:
mx.ndarray.argmax(result[0][0].softmax(), axis=1)


[1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1.
 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0. 1.
 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0.]
<NDArray 64 @gpu(0)>

In [20]:
result[0][1]


[1 0 0 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1
 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 0 1 1 0 1 0 1 1 0]
<NDArray 64 @gpu(0)>

In [21]:
model.save_parameters('model_test.params'.format(i))

In [16]:
result = {}
for i in range(1, 15):
    new_net = BERTClassifier(bert_base, num_classes=2, dropout=.3)
    new_net.load_parameters(
        'model{}.params'.format(i), ctx=ctx)
    acc = mx.metric.Accuracy()
    res = []
    for ix, (t, v, s, label) in enumerate(valid_dataloader2):
        if ix % 10 == 0:
            print('\r{}'.format(ix), end='')
        token_ids = t.as_in_context(ctx)
        valid_length = v.as_in_context(ctx)
        segment_ids = s.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = new_net(token_ids, segment_ids, 
                       valid_length.astype('float32'))
        res.append([output, label])
        acc.update(preds=output, labels=label)
    result[i] = (res, acc)

480

In [34]:
result[14][1].get()

('accuracy', 0.9991884170886898)

In [39]:
len(result[2][0])

482

In [51]:
res = []
for i in range(482):
    res.append(
        mx.ndarray.argmax(result[2][0][i][0].softmax(), axis=1)
    )

### test셋 적용해보기

In [53]:
path = '../dacon문자스미싱/filedown (2)'

test = pd.read_csv(path + '/public_test.csv')

In [54]:
from eunjeon import Mecab

mecab = Mecab()

In [56]:
test = test.set_index('id')
%time text = test.text.map(lambda x : mecab.morphs(x))

Wall time: 937 ms


In [58]:
stopwords = ['XXX', '.', '을', '를', '이', '가', 
             '-', '(', ')', ':', '!', '?', ')-', 
             '.-', 'ㅡ', 'XXXXXX', '..', '.(', '은', '는'] #필요없는 단어 리스트
%time text = text.map(lambda x : [_word for _word in x if _word not in stopwords])

Wall time: 44.4 ms


In [59]:
%time text = text.map(lambda x : ''.join(x))

Wall time: 8 ms


In [60]:
from chatspace import ChatSpace

spacer = ChatSpace()

Loading JIT Compiled ChatSpace Model


In [61]:
%time text = text.map(lambda x : spacer.space(x))

Wall time: 1min 20s


In [62]:
import pickle
with open('(test)spacing_text_except_stopwords.pkl', 'wb') as f:
    pickle.dump(text, f, protocol=pickle.HIGHEST_PROTOCOL)

In [66]:
import re
text = text.map(lambda x : re.sub('X', '', x))

In [69]:
pd.DataFrame(text).to_csv(
    './smishing_text/test_smishing.txt', sep='\t')

In [72]:
class BERTDataset_test(mx.gluon.data.Dataset):
    def __init__(self, dataset, sent_idx, 
                 bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, 
            pad=pad, pair=pair)
        sent_dataset = gluon.data.SimpleDataset(
            [[i[sent_idx],] for i in dataset]
        )
        self.sentences = sent_dataset.transform(transform)
    
    def __getitem__(self, i):
        return (self.sentences[i])
    
    def __len__(self):
        return len(self.sentences)

In [74]:
dataset_test = nlp.data.TSVDataset(
    "./smishing_text/test_smishing.txt".format(i), 
    field_indices=[1], num_discard_samples=1)

data_test = BERTDataset_test(dataset_test,
        0, bert_tokenizer, max_len, True, False)

test_dataloader = mx.gluon.data.DataLoader(
        data_test, batch_size=64, num_workers=0)

In [78]:
result = {}
for i in range(1, 15):
    new_net = BERTClassifier(bert_base, num_classes=2, dropout=.3)
    if i == 1:
            new_net.load_parameters(
            'model{}.params'.format('_test'), ctx=ctx)
    else:
            new_net.load_parameters(
            'model{}.params'.format(i), ctx=ctx)
    res = []
    for ix, (t, v, s) in enumerate(test_dataloader):
        if ix % 10 == 0:
            print('\r{}\t{}'.format(i, ix), end='')
        token_ids = t.as_in_context(ctx)
        valid_length = v.as_in_context(ctx)
        segment_ids = s.as_in_context(ctx)
        output = new_net(token_ids, segment_ids, 
                       valid_length.astype('float32'))
        res.append(output)
    result[i] = res

14	20

In [111]:
results = {}
for model_ix in range(1, 15):
    res = []
    for i in range(26):
        res.append(result[model_ix][i].softmax().asnumpy())
    res = np.concatenate(res)
    results[model_ix] = res

In [157]:
# Soft Voting
preds = np.concatenate(
    [results[model_ix].reshape(-1, 2, 1) 
     for model_ix in range(1, 15)], axis=2).mean(axis=2)
# Softmax
preds = preds / preds.sum(axis=1).reshape(-1, 1)

In [160]:
submission = pd.read_csv('../dacon문자스미싱/submission_제출양식.csv')

In [202]:
submission['smishing'] = preds[:, 1]

In [203]:
submission.smishing.map(
    lambda x : np.random.choice(2, p=(x, 1-x))).value_counts()

1    1478
0     148
Name: smishing, dtype: int64

In [204]:
submission

Unnamed: 0,id,smishing
0,340000,0.000504
1,340001,0.000505
2,340002,0.000489
3,340003,0.000477
4,340004,0.000479
...,...,...
1621,341621,0.000483
1622,341622,0.000475
1623,341623,0.000475
1624,341624,0.000485


In [205]:
submission.set_index('id').to_csv('submission_bert_ensemble.csv')