In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from nltk import FreqDist
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import random
import time
from tqdm import tqdm
import math
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 3.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 35.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 29.4 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [None]:
 ! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git 

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 91 (delta 43), reused 22 (delta 6), pack-reused 0[K
Unpacking objects: 100% (91/91), done.


In [None]:
 cd Mecab-ko-for-Google-Colab

/content/Mecab-ko-for-Google-Colab


In [None]:
 ! bash install_mecab-ko_on_colab190912.sh

Installing konlpy.....
Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 121 kB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 46.4 MB/s 
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.6 MB/s 
[?25hCollecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.g

In [None]:
from konlpy.tag import Mecab

In [None]:
# Fix every random number generator the notebook uses so runs are repeatable.
SEED = 2021

random.seed(SEED)                  # Python's built-in RNG
np.random.seed(SEED)               # NumPy's global RNG
torch.manual_seed(SEED)            # PyTorch CPU RNG
torch.cuda.manual_seed_all(SEED)   # all visible CUDA devices, not just the current one
# Ask cuDNN for deterministic kernels and switch off its auto-tuner, which can
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the files under PATH can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the competition CSV files.
PATH = '/content/drive/MyDrive/DL/DACON-news/'

# Train/test tables: the first column of each CSV is dropped on load.
train, test = (
    pd.read_csv(PATH + csv_name).iloc[:, 1:]
    for csv_name in ('train_data.csv', 'test_data.csv')
)
# sample_submission.csv and topic_dict.csv are loaded as-is.
submission, topic_dict = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('sample_submission.csv', 'topic_dict.csv')
)

# 전처리

In [None]:
# Inputs are the `title` column, targets the `topic_idx` column of the training table.
sentences = train['title']
labels = train['topic_idx'].values

# Split every title into morphemes with Mecab.
tokenizer = Mecab()
tokenized_texts = list(map(tokenizer.morphs, sentences))

In [None]:
## Build the vocabulary and the token -> id lookup.
# The two special tokens keep the ids 101 / 102; ordinary tokens are numbered
# from 1 upward and skip over those two reserved ids.
CLS_TOKEN, SEP_TOKEN = '[CLS]', '[SEP]'
CLS_ID, SEP_ID = 101, 102
MIN_FREQ = 3  # tokens seen fewer times than this are left out of the vocabulary

# Count every token. The special tokens are excluded so that re-running this
# cell (after they were inserted further down) cannot put them into `vocab`
# and overwrite their reserved ids.
vocab = FreqDist(
    token
    for sent in tokenized_texts
    for token in sent
    if token not in (CLS_TOKEN, SEP_TOKEN)
)
print('전체 단어 수:', len(vocab))
vocab = {word: count for word, count in vocab.items() if count >= MIN_FREQ}
print('전체 vocab 수:', len(vocab))

# Wrap each sentence as [CLS] ... [SEP]. The guards make the step idempotent:
# running the cell a second time does not add the markers again.
for t in tokenized_texts:
    if not t or t[0] != CLS_TOKEN:
        t.insert(0, CLS_TOKEN)
    if t[-1] != SEP_TOKEN:
        t.append(SEP_TOKEN)

word_to_index = {CLS_TOKEN: CLS_ID, SEP_TOKEN: SEP_ID}
index = 1
for word in vocab:
    if index == CLS_ID:
        # jump past the two reserved ids
        index = SEP_ID + 1
    word_to_index[word] = index
    index += 1

전체 단어 수: 30903
전체 vocab 수: 14603


In [None]:
# Length of the longest tokenized training sentence; reused later as the padding length.
max_token_num = pd.Series(list(map(len, tokenized_texts))).max()
max_token_num

26

In [None]:
## encoding
input_ids = []
for sent in tokenized_texts:
    sent_ids = []
    for token in sent:
        try:
            sent_ids.append(word_to_index[token])
        except:
            sent_ids.append(0)
    input_ids.append(sent_ids)

순서를 고려한 정수 encoding

In [None]:
## Padding: bring every sequence to one common length (pad / cut at the end).
MAX_LEN = max_token_num
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

## Masking: 1.0 where the id is above 0, 0.0 where it is 0.
# NOTE(review): out-of-vocabulary tokens were also encoded as 0, so they are
# masked together with the padding — confirm this is intended.
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

In [None]:
# Turn the padded ids, the labels and the masks into tensors.
train_inputs, train_labels, train_masks = (
    torch.tensor(array) for array in (input_ids, labels, attention_masks)
)

In [None]:
# Mini-batch size
batch_size = 32

# Bundle inputs, masks and labels into one dataset; the DataLoader then hands
# out `batch_size` examples at a time, drawn through a RandomSampler.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(data_source=train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)

test data 전처리

In [None]:
# Titles of the test set.
# NOTE: this re-binds `sentences` and `tokenized_texts`, which held the training data until now.
sentences = test['title']

tokenizer = Mecab()
# Wrap every tokenized title as [CLS] ... [SEP], the same way the training
# sentences were wrapped; without the markers the model would receive test
# inputs in a different format from the one it was trained on.
tokenized_texts = [['[CLS]', *tokenizer.morphs(t), '[SEP]'] for t in sentences]

In [None]:
## encoding
input_ids = []
for sent in tokenized_texts:
    sent_ids = []
    for token in sent:
        try:
            sent_ids.append(word_to_index[token])
        except:
            sent_ids.append(0)
    input_ids.append(sent_ids)

In [None]:
## Padding: same common length as for the training data (pad / cut at the end).
MAX_LEN = max_token_num
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

## Masking: 1.0 where the id is above 0, 0.0 where it is 0.
# NOTE(review): out-of-vocabulary tokens were also encoded as 0, so they are
# masked together with the padding — confirm this is intended.
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

In [None]:
# Turn the padded test ids and their masks into tensors.
test_inputs, test_masks = (
    torch.tensor(array) for array in (input_ids, attention_masks)
)

In [None]:
# Mini-batch size for inference
batch_size = 32

# No sampler and no shuffle flag are passed here, unlike for the training loader.
test_data = TensorDataset(test_inputs, test_masks)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size)

# 전처리 (pre)

In [None]:
# Write the train/test frames to PATH as train.csv / test.csv, without the index column.
train.to_csv(PATH+'train.csv', index=False)
test.to_csv(PATH+'test.csv', index=False)

In [None]:
# Read text.txt (no header row) and record, per entry of column 0, how many
# morphemes Mecab produces; describe() then summarises those counts.
# NOTE(review): text.txt is not created in any visible cell — confirm where it comes from.
data = pd.read_csv(PATH+'text.txt', header=None)
data['tokenized_len'] = [len(tokenizer.morphs(t)) for t in data[0]]
data.describe()

Unnamed: 0,tokenized_len
count,54785.0
mean,10.964844
std,2.573774
min,1.0
25%,9.0
50%,11.0
75%,13.0
max,22.0


In [None]:
def tokenize_ko(text):
    """Return the morphemes of `text`, as produced by the notebook-level `tokenizer`, in a list."""
    return list(tokenizer.morphs(text))

# NOTE(review): `Field` is not imported in any visible cell — presumably it comes
# from torchtext's data API; confirm and add the import to the import cell.
# Text field: tokenized with tokenize_ko, lower-cased, batch-first, fix_length=22.
text = Field(tokenize=tokenize_ko, sequential=True, use_vocab=True, lower=True, batch_first=True, fix_length=22)
# Label field: non-sequential target that does not use a vocabulary.
label = Field(sequential=False, use_vocab=False, is_target=True)
# (name, field) pairs handed to the dataset loader.
data_fields = [('text',text), ('label',label)]

In [None]:
# Load train.csv / test.csv from PATH (header row skipped) using `data_fields`.
# NOTE: this re-binds `train` and `test`, which were pandas DataFrames up to this point.
# NOTE(review): `TabularDataset` is not imported in any visible cell — confirm and add the import.
train, test = TabularDataset.splits(path=PATH, train='train.csv', test='test.csv',  
                                    format='csv', skip_header=True, fields=data_fields)

In [None]:
# Show the attributes (tokenized text and label) of the training example at index 1.
print(vars(train[1]))

{'text': ['실리콘밸리', '넘어서겠다', '…', '구글', '15조원', '들여', '美', '전역', '거점', '화'], 'label': '4'}


In [None]:
# Build the text field's vocabulary from the training set with min_freq=2,
# then print its size and the first 20 entries of its string-to-index mapping.
text.build_vocab(train, min_freq=2)
print(len(text.vocab))
print(list(text.vocab.stoi)[:20])

20843
['<unk>', '<pad>', '…', '에', '·', '종합', '서', '의', '로', '한', '2', '도', '명', '이', '대통령', '은', '3', '1', '美', '한국']


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Only query the GPU name when CUDA is present: torch.cuda.get_device_name(0)
# raises on a CPU-only runtime, which would defeat the fallback above.
if device.type == 'cuda':
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

cuda Tesla T4


In [None]:
# NOTE(review): `BucketIterator` and `Iterator` are not imported in any visible cell — confirm and add the import.
BATCH_SIZE = 32
# Training iterator: sorted by the number of text tokens (sort=True, sort_within_batch=True).
train_iter = BucketIterator(train, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
# Test iterator: neither shuffled nor sorted.
test_iter = Iterator(test, batch_size=BATCH_SIZE, 
                     device=device, train=False, shuffle=False, sort=False)

In [None]:
# Pull the first training batch and inspect the shape and first five rows of its text tensor.
batch = next(iter(train_iter))
ex = batch.text
print(ex.shape)
ex[:5]

torch.Size([32, 22])


tensor([[ 2133,    31,  9375,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [  427,  5780,  3192,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [   55, 15318,   190,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [   55,  5276,  5879,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [ 3367,   996,   192,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]], device='cuda:0')

# 모델링

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Only query the GPU name when CUDA is present: torch.cuda.get_device_name(0)
# raises on a CPU-only runtime, which would defeat the fallback above.
if device.type == 'cuda':
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

cuda Tesla T4


In [None]:
# Pretrained multilingual BERT with a 7-class sequence-classification head.
# NOTE(review): the inputs fed to this model are encoded with the notebook's own
# `word_to_index` ids, not with the checkpoint's tokenizer vocabulary, so the
# pretrained word embeddings do not line up with these ids — consider encoding
# with BertTokenizer (already imported) instead.
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=7)
# Move the model to the selected `device` instead of calling .cuda(), which
# fails on a CPU-only runtime.
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
epochs = 10
# The scheduler is stepped once per training batch, hence batches * epochs steps in total.
total_steps = epochs * len(train_dataloader)

optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# Linear schedule over `total_steps` with zero warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows of `preds` whose arg-max matches `labels`.

    preds  -- array of scores; the arg-max is taken along axis 1.
    labels -- array of reference class ids; flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [None]:
def format_time(elapsed):
    """Format a duration in seconds as the string form of a ``datetime.timedelta``.

    elapsed -- duration in seconds (int or float); rounded to the nearest whole second.
    Returns a string such as ``'0:05:09'``.
    """
    # Imported locally because the notebook imports `datetime` only in a cell
    # that comes after this definition; the function now works regardless of
    # whether that cell has been run.
    import datetime

    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
import time
import datetime

In [None]:
# Start from clean gradients.
model.zero_grad()

for epoch_i in range(epochs):

    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Per-epoch timer and running sum of the batch losses.
    t0 = time.time()
    total_loss = 0

    # Training mode
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress line every 500 batches (never for the very first batch).
        if step != 0 and step % 500 == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Move the batch to the selected device and unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass; labels are supplied, so the first output element is the loss.
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss = outputs[0]
        total_loss += loss.item()

        # Backward pass, then clip the gradient norm to 1.0 before the update.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update followed by the learning-rate schedule step.
        optimizer.step()
        scheduler.step()

        # Reset gradients for the next batch.
        model.zero_grad()

    # Mean loss over all batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

Training...
  Batch   500  of  1,427.    Elapsed: 0:01:47.
  Batch 1,000  of  1,427.    Elapsed: 0:03:36.

  Average training loss: 0.32
  Training epcoh took: 0:05:09
Training...
  Batch   500  of  1,427.    Elapsed: 0:01:49.
  Batch 1,000  of  1,427.    Elapsed: 0:03:38.

  Average training loss: 0.24
  Training epcoh took: 0:05:11
Training...
  Batch   500  of  1,427.    Elapsed: 0:01:49.
  Batch 1,000  of  1,427.    Elapsed: 0:03:38.

  Average training loss: 0.18
  Training epcoh took: 0:05:11
Training...
  Batch   500  of  1,427.    Elapsed: 0:01:49.
  Batch 1,000  of  1,427.    Elapsed: 0:03:38.

  Average training loss: 0.14
  Training epcoh took: 0:05:11
Training...
  Batch   500  of  1,427.    Elapsed: 0:01:49.
  Batch 1,000  of  1,427.    Elapsed: 0:03:37.

  Average training loss: 0.11
  Training epcoh took: 0:05:10
Training...
  Batch   500  of  1,427.    Elapsed: 0:01:49.
  Batch 1,000  of  1,427.    Elapsed: 0:03:37.

  Average training loss: 0.08
  Training epcoh took: 

In [None]:
# Start the timer
t0 = time.time()

# Switch the model to evaluation mode
model.eval()

# Predicted class id for every test example, collected in dataloader order
test_labels = []

# Iterate over the test set batch by batch
for step, batch in enumerate(test_dataloader):
    # Progress line every 100 batches (never for the very first batch)
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # Move the batch to the selected device; test batches hold (input ids, attention mask) only
    b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels are passed, so the first output element is read as the logits (not a loss)
    logits = outputs[0].detach().cpu().numpy()

    # Arg-max over axis 1 gives the predicted class id; store plain Python ints
    test_labels.extend(np.argmax(logits, axis=1).tolist())

print("")
print("Test took: {:}".format(format_time(time.time() - t0)))

  Batch   100  of    286.    Elapsed: 0:00:06.
  Batch   200  of    286.    Elapsed: 0:00:11.

Test took: 0:00:16


In [None]:
# Put the predicted class ids into the submission frame's topic_idx column.
submission['topic_idx'] = test_labels

In [None]:
# Save the filled-in submission under PATH, without the index column.
submission.to_csv(PATH+'0722_BERT_epoch10.csv', index=False)