In [1]:
import numpy as np
import pandas as pd
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd
import mxnet as mx
import time
import itertools
import random

In [2]:
ctx = mx.gpu()
ctx

gpu(0)

In [3]:
bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
                 dataset_name='wiki_multilingual_cased',
                 pretrained=True, ctx=ctx, use_pooler=True,
                 use_decoder=False, use_classifier=False)

In [4]:
ds = gluon.data.SimpleDataset([['나 보기가 역겨워', '김소월']])
tok = nlp.data.BERTTokenizer(vocab=vocabulary, lower=False)
trans = nlp.data.BERTSentenceTransform(tok, max_seq_length=10)

[print(i) for i in list(ds.transform(trans))[0]]

[    2  8982  9356 47869  9566     3  8935 22333 38851     3]
10
[0 0 0 0 0 0 1 1 1 1]


[None, None, None]

In [5]:
dataset_train = nlp.data.TSVDataset("./nsmc/ratings_train.txt", field_indices=[1,2], num_discard_samples=1)
dataset_test = nlp.data.TSVDataset("./nsmc/ratings_test.txt", field_indices=[1,2], num_discard_samples=1)

In [6]:
train = nlp.data.TSVDataset('./train_smishing.txt', field_indices=[1,2], num_discard_samples=1)

In [7]:
class BERTDataset(mx.gluon.data.Dataset):
    def __init__(self, dataset, sent_idx, label_idx, 
                 bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, 
            pad=pad, pair=pair)
        sent_dataset = gluon.data.SimpleDataset(
            [[i[sent_idx],] for i in dataset]
        )
        self.sentences = sent_dataset.transform(transform)
        self.labels = gluon.data.SimpleDataset(
            [np.array(np.int32(i[label_idx])) for i in dataset])
    
    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
    
    def __len__(self):
        return len(self.labels)

In [8]:
bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=False)
max_len = 64

In [9]:
data_train = BERTDataset(dataset_train, 0, 1, bert_tokenizer, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, bert_tokenizer, max_len, True, False)

In [10]:
class BERTClassifier(nn.Block):
    def __init__(self,
                 bert,
                 num_classes=2,
                 dropout=None,
                 prefix=None,
                 params=None):
        super(BERTClassifier, self).__init__(prefix=prefix, params=params)
        self.bert = bert
        with self.name_scope():
            self.classifier = nn.HybridSequential(prefix=prefix)
            if dropout:
                self.classifier.add(nn.Dropout(rate=dropout))
            self.classifier.add(nn.Dense(units=num_classes))
            
    def forward(self, inputs, token_types, valid_length=None):
        _, pooler = self.bert(inputs, token_types, valid_length)
        return self.classifier(pooler)

In [11]:
model = BERTClassifier(bert_base, num_classes=2, dropout=0.3)
# 분류 레이어만 초기화 한다. 
model.classifier.initialize(ctx=ctx)
model.hybridize()

# softmax cross entropy loss for classification
loss_function = gluon.loss.SoftmaxCELoss()

metric = mx.metric.Accuracy()

In [12]:
batch_size = 64
lr = 5e-5

train_dataloader = mx.gluon.data.DataLoader(data_train, batch_size=batch_size, num_workers=0)
test_dataloader = mx.gluon.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)

In [13]:
trainer = gluon.Trainer(model.collect_params(), 'bertadam',
                        {'learning_rate': lr, 
                         'epsilon': 1e-9, 
                         'wd':0.01})

log_interval = 4
num_epochs = 4

In [14]:
# LayerNorm과 Bias에는 Weight Decay를 적용하지 않는다. 
for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
    v.wd_mult = 0.0
params = [
    p for p in model.collect_params().values() if p.grad_req != 'null'
]

In [15]:
len(params)

201

In [16]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    acc = mx.metric.Accuracy()
    i = 0
    for i, (t,v,s, label) in enumerate(data_iter):
        token_ids = t.as_in_context(ctx)
        valid_length = v.as_in_context(ctx)
        segment_ids = s.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = model(token_ids, segment_ids, valid_length.astype('float32'))
        acc.update(preds=output, labels=label)
        if i > 1000:
            break
        i += 1
    return(acc.get()[1])

In [17]:
# Learning rate warmup을 위한 준비 
step_size = batch_size 
num_train_examples = len(data_train)
num_train_steps = int(num_train_examples / step_size * num_epochs)
warmup_ratio = 0.1
num_warmup_steps = int(num_train_steps * warmup_ratio)
step_num = 0

In [18]:
for epoch_id in range(num_epochs):
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, 
                   valid_length, 
                   segment_ids, 
                   label) in enumerate(train_dataloader):
        step_num += 1
        if step_num < num_warmup_steps:
            new_lr = lr * step_num / num_warmup_steps
        else:
            offset = (step_num - num_warmup_steps) * lr / (
                num_train_steps - num_warmup_steps)
            new_lr = lr - offset
        trainer.set_learning_rate(new_lr)
        with mx.autograd.record():
            # load data to GPU
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            label = label.as_in_context(ctx)

            # forward computation
            out = model(token_ids, segment_ids, 
                        valid_length.astype('float32'))
            ls = loss_function(out, label).mean()

        # backward computation
        ls.backward()
        trainer.allreduce_grads()
        nlp.utils.clip_grad_global_norm(params, 1)
        trainer.update(token_ids.shape[0])

        step_loss += ls.asscalar()
        metric.update([label], [out])
        if (batch_id + 1) % (50) == 0:
            print('\r[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.10f}, acc={:.3f}'
                         .format(epoch_id + 1, batch_id + 1, 
                                 len(train_dataloader),
                                 step_loss / log_interval,
                                 trainer.learning_rate, 
                                 metric.get()[1]),
                 end='')
            step_loss = 0
        elif batch_id == 10:
            break
    test_acc = evaluate_accuracy(model, test_dataloader, ctx)
    print('\nTest Acc : {}'.format(test_acc))


Test Acc : 0.52686


KeyboardInterrupt: 

In [19]:
out


[[ 0.40061206  0.35546818]
 [ 0.32571524  0.2718605 ]
 [ 0.13055156  0.6908209 ]
 [ 0.11203045  0.41319743]
 [ 0.135449    0.34487763]
 [ 0.25897866  1.0362484 ]
 [ 0.45636153  0.18079619]
 [ 0.2402434   0.4008932 ]
 [ 0.12857051  0.08971198]
 [-0.21496539  0.14349057]
 [ 0.3352273   0.22073273]
 [ 0.05570364  0.7990796 ]
 [ 0.25062743 -0.07154827]
 [ 0.2273978   0.36972913]
 [ 0.15171419  0.14698373]
 [-0.02592476 -0.12936436]
 [-0.11573608  0.5158846 ]
 [ 0.21563885  0.4094187 ]
 [ 0.31336477 -0.14875774]
 [ 0.17630681  0.25964037]
 [ 0.3710956   0.30664656]
 [ 0.50876075  0.5201391 ]
 [ 0.06279844  0.4675638 ]
 [ 0.24566874  0.70804137]
 [-0.08556962  0.4800857 ]
 [ 0.3667487   0.6270006 ]
 [ 0.37454385 -0.11083919]
 [ 0.35946134  0.64396936]
 [-0.13175201  0.24146204]
 [ 0.27762577  0.26266006]
 [ 0.35406485  0.19143829]
 [ 0.29076728  0.00792346]
 [ 0.49147323  0.32852086]
 [-0.05769968 -0.14159451]
 [ 0.46907485  0.559992  ]
 [ 0.13862616 -0.05649821]
 [ 0.3221844   0.5495527 ]


In [32]:
acc = mx.metric.Accuracy()

In [34]:
test_dataloader

<mxnet.gluon.data.dataloader.DataLoader at 0x20b77531860>

# Start Smishing Detection

In [1]:
import numpy as np
import pandas as pd
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd
import mxnet as mx
import time
import itertools
import random

In [2]:
path = '../dacon문자스미싱/filedown (2)'

train = pd.read_csv(path + '/train.csv')
test = pd.read_csv(path + '/public_test.csv')

In [3]:
train

Unnamed: 0,id,year_month,text,smishing
0,0,2017-01,XXX은행성산XXX팀장입니다.행복한주말되세요,0
1,1,2017-01,오늘도많이웃으시는하루시작하세요XXX은행 진월동VIP라운지 XXX올림,0
2,2,2017-01,안녕하십니까 고객님. XXX은행입니다.금일 납부하셔야 할 금액은 153600원 입니...,0
3,4,2017-01,XXX 고객님안녕하세요XXX은행 XXX지점입니다지난 한 해 동안 저희 XXX지점에 ...,0
4,5,2017-01,1월은 새로움이 가득XXX입니다.올 한해 더 많이행복한 한해되시길바랍니다,0
...,...,...,...,...
295940,336373,2018-12,XXX 고객님!열심히 달려왔던 2018년도 어느 새 뒤안길로 지나쳐가고 벅찬 설렘으...,0
295941,336375,2018-12,XXX고객님실버웰빙신탁이 만기도래 예정입니다.XXX남양주지점,0
295942,336376,2018-12,한해동안 XXX은행과 함께 해주셔서 정말 감사드립니다 2019년 기해년을 맞이하며 ...,0
295943,336377,2018-12,1228(금)예금및 적금 만기입니다.예금은 시간내서 내점하시고 적금은 1년 자동연장...,0


In [4]:
from sklearn.model_selection import train_test_split

train = train.set_index('id')
X_train, X_valid, y_train, y_valid = train_test_split(
                 train['text'], train['smishing'],
                 random_state=42, test_size=.2,
                 stratify=train['smishing'])

In [5]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((236756,), (59189,), (236756,), (59189,))

In [6]:
y_train.value_counts()

0    221794
1     14962
Name: smishing, dtype: int64

In [7]:
y_valid.value_counts()

0    55448
1     3741
Name: smishing, dtype: int64

In [8]:
pd.concat((X_train, y_train), axis=1).to_csv(
    'train_smishing.txt', sep='\t')
pd.concat((X_valid, y_valid), axis=1).to_csv(
    'valid_smishing.txt', sep='\t')

In [9]:
ctx = mx.gpu()
ctx

gpu(0)

In [10]:
bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
                 dataset_name='wiki_multilingual_cased',
                 pretrained=True, ctx=ctx, use_pooler=True,
                 use_decoder=False, use_classifier=False)

In [11]:
dataset_train = nlp.data.TSVDataset("./train_smishing.txt", field_indices=[1,2], num_discard_samples=1)
dataset_valid = nlp.data.TSVDataset("./valid_smishing.txt", field_indices=[1,2], num_discard_samples=1)

In [12]:
class BERTDataset(mx.gluon.data.Dataset):
    def __init__(self, dataset, sent_idx, label_idx, 
                 bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, 
            pad=pad, pair=pair)
        sent_dataset = gluon.data.SimpleDataset(
            [[i[sent_idx],] for i in dataset]
        )
        self.sentences = sent_dataset.transform(transform)
        self.labels = gluon.data.SimpleDataset(
            [np.array(np.int32(i[label_idx])) for i in dataset])
    
    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
    
    def __len__(self):
        return len(self.labels)

In [13]:
bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=False)
max_len = 64

In [14]:
data_train = BERTDataset(dataset_train, 0, 1, bert_tokenizer, max_len, True, False)
data_valid = BERTDataset(dataset_valid, 0, 1, bert_tokenizer, max_len, True, False)

In [15]:
class BERTClassifier(nn.Block):
    def __init__(self,
                 bert,
                 num_classes=2,
                 dropout=None,
                 prefix=None,
                 params=None):
        super(BERTClassifier, self).__init__(prefix=prefix, params=params)
        self.bert = bert
        with self.name_scope():
            self.classifier = nn.HybridSequential(prefix=prefix)
            if dropout:
                self.classifier.add(nn.Dropout(rate=dropout))
            self.classifier.add(nn.Dense(units=num_classes))
            
    def forward(self, inputs, token_types, valid_length=None):
        _, pooler = self.bert(inputs, token_types, valid_length)
        return self.classifier(pooler)

In [None]:
from operator import itemgetter

@mx.metric.register
@mx.metric.alias('auc')
class AUCMetric(mx.metric.EvalMetric):
    def __init__(self, eps=1e-12):
        super(AUCMetric, self).__init__(
            'auc')
        self.eps = eps

    def update(self, labels, preds):
        mx.metric.check_label_shapes(labels, preds)
        label_weight = labels[0].asnumpy()
        preds = preds[0].asnumpy()
        tmp = []
        for i in range(preds.shape[0]):
            tmp.append((label_weight[i], preds[i]))
#         tmp = sorted(tmp, key=itemgetter(1), reverse=True)
        label_sum = label_weight.sum()
        if label_sum == 0 or label_sum == label_weight.size:
            raise Exception("AUC with one class is undefined")

        label_one_num = np.count_nonzero(label_weight)
        label_zero_num = len(label_weight) - label_one_num
        total_area = label_zero_num * label_one_num
        height = 0
        width = 0
        area = 0
        for a, _ in tmp:
            if a == 1.0:
                height += 1.0
            else:
                width += 1.0
                area += height

        self.sum_metric += area / total_area
        self.num_inst += 1

In [18]:
model = BERTClassifier(bert_base, num_classes=2, dropout=0.3)
# 분류 레이어만 초기화 한다. 
model.classifier.initialize(ctx=ctx)
model.hybridize()

# softmax cross entropy loss for classification
loss_function = gluon.loss.SoftmaxCELoss()

metric = mx.metric.Accuracy()
# metric = AUCMetric()

In [19]:
batch_size = 64
lr = 5e-5

train_dataloader = mx.gluon.data.DataLoader(data_train, batch_size=batch_size, num_workers=0)
valid_dataloader = mx.gluon.data.DataLoader(data_valid, batch_size=batch_size, num_workers=0)

In [20]:
trainer = gluon.Trainer(model.collect_params(), 'bertadam',
                        {'learning_rate': lr, 
                         'epsilon': 1e-9, 
                         'wd':0.01})

log_interval = 4
num_epochs = 4

In [21]:
# LayerNorm과 Bias에는 Weight Decay를 적용하지 않는다. 
for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
    v.wd_mult = 0.0
params = [
    p for p in model.collect_params().values() if p.grad_req != 'null'
]

In [22]:
# Learning rate warmup을 위한 준비 
step_size = batch_size 
num_train_examples = len(data_train)
num_train_steps = int(num_train_examples / step_size * num_epochs)
warmup_ratio = 0.1
num_warmup_steps = int(num_train_steps * warmup_ratio)
step_num = 0

In [23]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    acc = mx.metric.Accuracy()
    i = 0
    for i, (t,v,s, label) in enumerate(data_iter):
        token_ids = t.as_in_context(ctx)
        valid_length = v.as_in_context(ctx)
        segment_ids = s.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = model(token_ids, segment_ids, valid_length.astype('float32'))
        acc.update(preds=output, labels=label)
        if i > 1000:
            break
        i += 1
    return(acc.get()[1])

In [24]:
for epoch_id in range(num_epochs):
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, 
                   valid_length, 
                   segment_ids, 
                   label) in enumerate(train_dataloader):
        step_num += 1
        if step_num < num_warmup_steps:
            new_lr = lr * step_num / num_warmup_steps
        else:
            offset = (step_num - num_warmup_steps) * lr / (
                num_train_steps - num_warmup_steps)
            new_lr = lr - offset
        trainer.set_learning_rate(new_lr)
        with mx.autograd.record():
            # load data to GPU
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            label = label.as_in_context(ctx)

            # forward computation
            out = model(token_ids, segment_ids, 
                        valid_length.astype('float32'))
            ls = loss_function(out, label).mean()

        # backward computation
        ls.backward()
        trainer.allreduce_grads()
        nlp.utils.clip_grad_global_norm(params, 1)
        trainer.update(token_ids.shape[0])

        step_loss += ls.asscalar()
        metric.update([label], [out])
        if (batch_id + 1) % (50) == 0:
            print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.10f}, acc={:.3f}'
                         .format(epoch_id + 1, batch_id + 1, 
                                 len(train_dataloader),
                                 step_loss / log_interval,
                                 trainer.learning_rate, 
                                 metric.get()[1]))
            step_loss = 0
    valid_acc = evaluate_accuracy(model, valid_dataloader, ctx)
    print('\nValid acc : {}'.format(valid_acc))

[Epoch 1 Batch 50/3700] loss=4.1034, lr=0.0000016903, acc=0.873
[Epoch 1 Batch 100/3700] loss=1.0404, lr=0.0000033807, acc=0.919
[Epoch 1 Batch 150/3700] loss=0.6426, lr=0.0000050710, acc=0.941
[Epoch 1 Batch 200/3700] loss=0.4843, lr=0.0000067613, acc=0.953
[Epoch 1 Batch 250/3700] loss=0.4260, lr=0.0000084517, acc=0.960
[Epoch 1 Batch 300/3700] loss=0.2746, lr=0.0000101420, acc=0.966
[Epoch 1 Batch 350/3700] loss=0.4069, lr=0.0000118323, acc=0.970
[Epoch 1 Batch 400/3700] loss=0.2231, lr=0.0000135227, acc=0.973
[Epoch 1 Batch 450/3700] loss=0.1175, lr=0.0000152130, acc=0.976
[Epoch 1 Batch 500/3700] loss=0.2951, lr=0.0000169033, acc=0.978
[Epoch 1 Batch 550/3700] loss=0.2699, lr=0.0000185936, acc=0.979
[Epoch 1 Batch 600/3700] loss=0.2785, lr=0.0000202840, acc=0.981
[Epoch 1 Batch 650/3700] loss=0.2811, lr=0.0000219743, acc=0.982
[Epoch 1 Batch 700/3700] loss=0.1271, lr=0.0000236646, acc=0.983
[Epoch 1 Batch 750/3700] loss=0.1214, lr=0.0000253550, acc=0.984
[Epoch 1 Batch 800/3700] l

KeyboardInterrupt: 

# 전처리해서 다시 도전

In [1]:
import numpy as np
import pandas as pd
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd
import mxnet as mx
import time
import itertools
import random

In [2]:
path = '../dacon문자스미싱/filedown (2)'

train = pd.read_csv(path + '/train.csv')
test = pd.read_csv(path + '/public_test.csv')

In [3]:
train = train.set_index('id')

In [4]:
train.text

id
0                                  XXX은행성산XXX팀장입니다.행복한주말되세요
1                     오늘도많이웃으시는하루시작하세요XXX은행 진월동VIP라운지 XXX올림
2         안녕하십니까 고객님. XXX은행입니다.금일 납부하셔야 할 금액은 153600원 입니...
4         XXX 고객님안녕하세요XXX은행 XXX지점입니다지난 한 해 동안 저희 XXX지점에 ...
5                  1월은 새로움이 가득XXX입니다.올 한해 더 많이행복한 한해되시길바랍니다
                                ...                        
336373    XXX 고객님!열심히 달려왔던 2018년도 어느 새 뒤안길로 지나쳐가고 벅찬 설렘으...
336375                    XXX고객님실버웰빙신탁이 만기도래 예정입니다.XXX남양주지점
336376    한해동안 XXX은행과 함께 해주셔서 정말 감사드립니다 2019년 기해년을 맞이하며 ...
336377    1228(금)예금및 적금 만기입니다.예금은 시간내서 내점하시고 적금은 1년 자동연장...
336378    안녕하세요 XXX 고객님. 스타링크 전담직원 XXX입니다. 스타링크 고객님 대상으로...
Name: text, Length: 295945, dtype: object

In [5]:
from eunjeon import Mecab

mecab = Mecab()

In [6]:
%time text = train.text.map(lambda x : mecab.morphs(x))

Wall time: 46.2 s


In [7]:
stopwords = ['XXX', '.', '을', '를', '이', '가', 
             '-', '(', ')', ':', '!', '?', ')-', 
             '.-', 'ㅡ', 'XXXXXX', '..', '.(', '은', '는'] #필요없는 단어 리스트
%time text = text.map(lambda x : [_word for _word in x if _word not in stopwords])

Wall time: 6.12 s


In [8]:
%time text = text.map(lambda x : ''.join(x))

Wall time: 871 ms


In [9]:
from chatspace import ChatSpace

spacer = ChatSpace()

Loading JIT Compiled ChatSpace Model


In [10]:
%time text = text.map(lambda x : spacer.space(x))

Wall time: 2h 33min 13s


In [12]:
import pickle
with open('spacing_text_except_stopwords.pkl', 'wb') as f:
    pickle.dump(text, f, protocol=pickle.HIGHEST_PROTOCOL)

In [88]:
train2 = pd.concat((text, train['smishing']), axis=1)

In [89]:
a, b = train2['smishing'].value_counts(normalize=True).values

In [90]:
a // b

14.0

In [91]:
import re
train2['text'] = train2.text.map(lambda x : re.sub('X', '', x))

In [92]:
normal = train2[train2['smishing'] == 0]
abnormal = train2[train2['smishing'] == 1]

In [93]:
ind = list(range(len(normal)))

In [94]:
np.random.shuffle(ind)

In [137]:
train_data = []
for i in range(14):
    train_data.append(
        normal.iloc[ind[i*len(normal)//14:(i+1)*len(normal)//14]]
    )

In [138]:
for i in range(14):
    train_data[i] = \
        pd.concat((train_data[i], abnormal)).sort_index()
    random_index = \
        np.random.permutation(len(normal) // 14 + len(abnormal))
    train_data[i] = train_data[i].iloc[random_index]

In [139]:
import sys
mod = sys.modules[__name__]
from sklearn.model_selection import train_test_split

for i in range(14):
    X_train, X_valid, y_train, y_valid = train_test_split(
                     train_data[i]['text'], 
                     train_data[i]['smishing'],
                     test_size=.2,
                     stratify=train_data[i]['smishing'])
    setattr(mod, 'X_train{}'.format(i+1), X_train)
    setattr(mod, 'X_valid{}'.format(i+1), X_valid)
    setattr(mod, 'y_train{}'.format(i+1), y_train)
    setattr(mod, 'y_valid{}'.format(i+1), y_valid)

In [141]:
for i in range(1, 15):
    pd.concat((getattr(mod, 'X_train{}'.format(i)),
               getattr(mod, 'y_train{}'.format(i))), 
              axis=1).to_csv(
        './smishing_text/train_smishing{}.txt'.format(i), 
                    sep='\t')
    pd.concat((getattr(mod, 'X_valid{}'.format(i)),
               getattr(mod, 'y_valid{}'.format(i))), 
              axis=1).to_csv(
        './smishing_text/valid_smishing{}.txt'.format(i), 
                    sep='\t')

In [142]:
ctx = mx.gpu()
ctx

gpu(0)

In [143]:
bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
                 dataset_name='wiki_multilingual_cased',
                 pretrained=True, ctx=ctx, use_pooler=True,
                 use_decoder=False, use_classifier=False)

In [145]:
for i in range(1, 15):
    setattr(mod, 'dataset_train{}'.format(i), 
            nlp.data.TSVDataset(
        "./smishing_text/train_smishing{}.txt".format(i), 
        field_indices=[1,2], num_discard_samples=1))
    setattr(mod, 'dataset_valid{}'.format(i),
            nlp.data.TSVDataset(
        "./smishing_text/valid_smishing{}.txt".format(i), 
        field_indices=[1,2], num_discard_samples=1))

In [146]:
class BERTDataset(mx.gluon.data.Dataset):
    def __init__(self, dataset, sent_idx, label_idx, 
                 bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, 
            pad=pad, pair=pair)
        sent_dataset = gluon.data.SimpleDataset(
            [[i[sent_idx],] for i in dataset]
        )
        self.sentences = sent_dataset.transform(transform)
        self.labels = gluon.data.SimpleDataset(
            [np.array(np.int32(i[label_idx])) for i in dataset])
    
    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
    
    def __len__(self):
        return len(self.labels)

In [147]:
bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=False)
max_len = 64

In [148]:
for i in range(1, 15):
    setattr(mod, 'data_train{}'.format(i), 
            BERTDataset(
                getattr(mod, 'dataset_train{}'.format(i)), 
                0, 1, bert_tokenizer, max_len, True, False))
    setattr(mod, 'data_valid{}'.format(i),
           BERTDataset(
               getattr(mod, 'dataset_valid{}'.format(i)), 
               0, 1, bert_tokenizer, max_len, True, False))

In [149]:
class BERTClassifier(nn.Block):
    def __init__(self,
                 bert,
                 num_classes=2,
                 dropout=None,
                 prefix=None,
                 params=None):
        super(BERTClassifier, self).__init__(prefix=prefix, params=params)
        self.bert = bert
        with self.name_scope():
            self.classifier = nn.HybridSequential(prefix=prefix)
            if dropout:
                self.classifier.add(nn.Dropout(rate=dropout))
            self.classifier.add(nn.Dense(units=num_classes))
            
    def forward(self, inputs, token_types, valid_length=None):
        _, pooler = self.bert(inputs, token_types, valid_length)
        return self.classifier(pooler)

In [150]:
for i in range(1, 15):
    model_ = 'model{}'.format(i)
    setattr(mod, model_,
            BERTClassifier(bert_base, num_classes=2, dropout=0.3))

    # 분류 레이어만 초기화 한다. 
    getattr(mod, model_).classifier.initialize(ctx=ctx)
    getattr(mod, model_).hybridize()

# softmax cross entropy loss for classification
loss_function = gluon.loss.SoftmaxCELoss()

metric = mx.metric.Accuracy()

In [151]:
batch_size = 64
lr = 5e-5

for i in range(1, 15):
    setattr(mod, 'train_dataloader{}'.format(i),
            mx.gluon.data.DataLoader(
                getattr(mod, 'data_train{}'.format(i)), 
                batch_size=batch_size, num_workers=0))
    setattr(mod, 'valid_dataloader{}'.format(i),
            mx.gluon.data.DataLoader(
                getattr(mod, 'data_valid{}'.format(i)), 
                batch_size=batch_size, num_workers=0))

In [154]:
log_interval = 4
num_epochs = 5

for i in range(1, 15):
    model_ = getattr(mod, 'model{}'.format(i))
    setattr(mod, 'trainer{}'.format(i), gluon.Trainer(
        model_.collect_params(), 
        'bertadam',
        {'learning_rate': lr, 
         'epsilon': 1e-9, 
         'wd':0.01}))
    # LayerNorm과 Bias에는 Weight Decay를 적용하지 않는다. 
    iters = model_.collect_params('.*beta|.*gamma|.*bias').items()
    for _, v in iters:
        v.wd_mult = 0.0

In [161]:
for i in range(1, 15):
    model_ = getattr(mod, 'model{}'.format(i))
    setattr(mod, 'params{}'.format(i),
           [p for p in model_.collect_params().values() 
              if p.grad_req != 'null']) 

In [158]:
# Learning rate warmup을 위한 준비 
step_size = batch_size 
num_train_examples = len(data_train1)
num_train_steps = int(num_train_examples / step_size * num_epochs)
warmup_ratio = 0.1
num_warmup_steps = int(num_train_steps * warmup_ratio)
step_num = 0

In [159]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    acc = mx.metric.Accuracy()
    i = 0
    for i, (t,v,s, label) in enumerate(data_iter):
        token_ids = t.as_in_context(ctx)
        valid_length = v.as_in_context(ctx)
        segment_ids = s.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = model(token_ids, segment_ids, valid_length.astype('float32'))
        acc.update(preds=output, labels=label)
        if i > 1000:
            break
        i += 1
    return(acc.get()[1])

In [177]:
len(data_train1)

30804

In [163]:
for i in range(1, 15):
    print('{}번째 모델'.format(i))
    model = getattr(mod, 'model{}'.format(i))
    trainer = getattr(mod, 'trainer{}'.format(i))
    train_dataloader = getattr(mod, 'train_dataloader{}'.format(i))
    valid_dataloader = getattr(mod, 'valid_dataloader{}'.format(i))
    params = getattr(mod, 'params{}'.format(i))
    for epoch_id in range(num_epochs):
        metric.reset()
        step_loss = 0
        for batch_id, (token_ids, 
                       valid_length, 
                       segment_ids, 
                       label) in enumerate(train_dataloader):
            step_num += 1
            if step_num < num_warmup_steps:
                new_lr = lr * step_num / num_warmup_steps
            else:
                offset = (step_num - num_warmup_steps) * lr / (
                    num_train_steps - num_warmup_steps)
                new_lr = lr - offset
            trainer.set_learning_rate(new_lr)
            with mx.autograd.record():
                # load data to GPU
                token_ids = token_ids.as_in_context(ctx)
                valid_length = valid_length.as_in_context(ctx)
                segment_ids = segment_ids.as_in_context(ctx)
                label = label.as_in_context(ctx)

                # forward computation
                out = model(token_ids, segment_ids, 
                            valid_length.astype('float32'))
                ls = loss_function(out, label).mean()

            # backward computation
            ls.backward()
            trainer.allreduce_grads()
            nlp.utils.clip_grad_global_norm(params, 1)
            trainer.update(token_ids.shape[0])

            step_loss += ls.asscalar()
            metric.update([label], [out])
            if (batch_id + 1) % (50) == 0:
                print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.10f}, acc={:.3f}'
                             .format(epoch_id + 1, batch_id + 1, 
                                     len(train_dataloader),
                                     step_loss / log_interval,
                                     trainer.learning_rate, 
                                     metric.get()[1]))
                step_loss = 0
        valid_acc = evaluate_accuracy(model, valid_dataloader, ctx)
        print('Valid acc : {}\n'.format(valid_acc))

1번째 모델
[Epoch 1 Batch 50/482] loss=2.8177, lr=0.0000116667, acc=0.917
[Epoch 1 Batch 100/482] loss=0.8137, lr=0.0000220833, acc=0.950
[Epoch 1 Batch 150/482] loss=1.0392, lr=0.0000325000, acc=0.959
[Epoch 1 Batch 200/482] loss=1.5004, lr=0.0000429167, acc=0.963
[Epoch 1 Batch 250/482] loss=2.6393, lr=0.0000496307, acc=0.962
[Epoch 1 Batch 300/482] loss=1.7526, lr=0.0000484765, acc=0.963
[Epoch 1 Batch 350/482] loss=2.0649, lr=0.0000473223, acc=0.962
[Epoch 1 Batch 400/482] loss=2.3171, lr=0.0000461681, acc=0.959
[Epoch 1 Batch 450/482] loss=1.3803, lr=0.0000450139, acc=0.961

Valid acc : 0.9848091404829914
[Epoch 2 Batch 50/482] loss=1.0960, lr=0.0000431210, acc=0.982
[Epoch 2 Batch 100/482] loss=1.4387, lr=0.0000419668, acc=0.974
[Epoch 2 Batch 150/482] loss=0.8660, lr=0.0000408126, acc=0.978
[Epoch 2 Batch 200/482] loss=0.4934, lr=0.0000396584, acc=0.981
[Epoch 2 Batch 250/482] loss=1.2836, lr=0.0000385042, acc=0.978
[Epoch 2 Batch 300/482] loss=1.5405, lr=0.0000373500, acc=0.976
[Ep

MXNetError: [14:01:22] c:\jenkins\workspace\mxnet-tag\mxnet\src\storage\./pooled_storage_manager.h:157: cudaMalloc failed: out of memory

In [172]:
ctx.empty_cache()

In [170]:
model.save_parameters('model1.params')

In [173]:
for i in range(2, 15):
    print('{}번째 모델'.format(i))
    model = getattr(mod, 'model{}'.format(i))
    trainer = getattr(mod, 'trainer{}'.format(i))
    train_dataloader = getattr(mod, 'train_dataloader{}'.format(i))
    valid_dataloader = getattr(mod, 'valid_dataloader{}'.format(i))
    params = getattr(mod, 'params{}'.format(i))
    for epoch_id in range(num_epochs):
        metric.reset()
        step_loss = 0
        for batch_id, (token_ids, 
                       valid_length, 
                       segment_ids, 
                       label) in enumerate(train_dataloader):
            step_num += 1
            if step_num < num_warmup_steps:
                new_lr = lr * step_num / num_warmup_steps
            else:
                offset = (step_num - num_warmup_steps) * lr / (
                    num_train_steps - num_warmup_steps)
                new_lr = lr - offset
            trainer.set_learning_rate(new_lr)
            with mx.autograd.record():
                # load data to GPU
                token_ids = token_ids.as_in_context(ctx)
                valid_length = valid_length.as_in_context(ctx)
                segment_ids = segment_ids.as_in_context(ctx)
                label = label.as_in_context(ctx)

                # forward computation
                out = model(token_ids, segment_ids, 
                            valid_length.astype('float32'))
                ls = loss_function(out, label).mean()

            # backward computation
            ls.backward()
            trainer.allreduce_grads()
            nlp.utils.clip_grad_global_norm(params, 1)
            trainer.update(token_ids.shape[0])

            step_loss += ls.asscalar()
            metric.update([label], [out])
            if (batch_id + 1) % (50) == 0:
                print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.10f}, acc={:.3f}'
                             .format(epoch_id + 1, batch_id + 1, 
                                     len(train_dataloader),
                                     step_loss / log_interval,
                                     trainer.learning_rate, 
                                     metric.get()[1]))
                step_loss = 0
        valid_acc = evaluate_accuracy(model, valid_dataloader, ctx)
        print('Valid acc : {}\n'.format(valid_acc))

2번째 모델


MXNetError: [14:18:56] c:\jenkins\workspace\mxnet-tag\mxnet\src\operator\nn\../mxnet_op.h:805: Check failed: err == cudaSuccess (2 vs. 0) : Name: mxnet_generic_kernel_ex ErrStr:out of memory

In [175]:
new_net = BERTClassifier(bert_base, num_classes=2, dropout=0.3)
new_net.load_parameters('model1.params', ctx=ctx)

In [167]:
model1()

AttributeError: 'BERTClassifier' object has no attribute 'predict'