In [1]:
import os
import re
import numpy as np
from tqdm import tqdm

import tensorflow as tf
from transformers import BertModel, TFBertModel
from transformers import BertTokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import pandas as pd
import matplotlib.pyplot as plt

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    
    # 텐서플로가 첫 번째 GPU만 사용하도록 제한
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
    except RuntimeError as e:
        # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
        print(e)

In [3]:
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string],"")
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

In [4]:
# random seed 고정
tf.random.set_seed(1234)
np.random.seed(1234)

BATCH_SIZE = 64
NUM_EPOCHS = 3
VALID_SPLIT = 0.2
MAX_LEN = 44 # EDA에서 추출된 max Lenght => EDA: 탐색적 데이터분석
DATA_IN_PATH = 'data_in/open/'
DATA_OUT_PATH = 'data_out'


In [5]:
tokenizer = BertTokenizer.from_pretrained("klue/roberta-large")

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", chach_dir='bert-ckpt', do_lower_case=False)

In [6]:
train_data = pd.read_csv(DATA_IN_PATH + "train_data.csv")
test_data = pd.read_csv(DATA_IN_PATH + "test_data.csv")
submission = pd.read_csv(DATA_IN_PATH + "sample_submission.csv")

In [7]:
def bert_tokenizer(sent, MAX_LEN):
    
    encoded_dict = tokenizer.encode_plus(
        text = sent,
        add_special_tokens = True,  # Add [CLS] + [SEP]
        max_length = MAX_LEN,       # Pad & truncate all sentences(truncate: 길이를 줄이다.)
        pad_to_max_length = True,
        return_attention_mask = True #Construct attn masks 
    )
    
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask'] # And its attention mask (simply differentiates padding from non-padding).
    token_type_id = encoded_dict['token_type_ids']
    
    return input_id, attention_mask, token_type_id

In [8]:
train_data.sample(5)

Unnamed: 0,index,title,topic_idx
38693,38693,영흥화력발전소 화물노동자 사망사고 진상 규명 촉구 기자회견,2
36674,36674,日축구스타 혼다 멕시코 파추카 입단회견서 깜짝 스페인어,5
11399,11399,코스피 상승 출발…2100선 회복 시도1보,1
15603,15603,美 이라크 그린존 피격 관련 시아파 민병대 의심,4
29418,29418,대한항공·현대건설 남녀 프로배구 전반기 A학점,5


In [9]:
test_data.sample(5)

Unnamed: 0,index,title
3893,49547,쩍 갈라지고 큰 바위 쿵…곳곳에 해빙기 안전사고 주의보
6809,52463,북중교역 거점 단둥서 대북제재 이완 조짐…교역 급증세
1910,47564,MB 수사팀 구속·불구속 방안 모두 보고…구속수사에 무게종합3보
2939,48593,홍영표 탄력근로제 단위기간 확대 정기국회서 논의종합2보
3357,49011,이스라엘 아프리카 차드와 47년 만에 외교관계 복원


In [10]:
submission.sample(5)

Unnamed: 0,index,topic_idx
2977,48631,0
802,46456,0
1893,47547,0
3752,49406,0
5503,51157,0


In [11]:
train_data['topic_idx'].value_counts(sort=True).sort_index()

0    4824
1    6222
2    7362
3    5933
4    7629
5    6933
6    6751
Name: topic_idx, dtype: int64

In [12]:
input_ids = []
attention_masks = []
token_type_ids = []
train_data_labels = []

for train_sent, train_label in tqdm(zip(train_data["title"], train_data['topic_idx']), total=len(train_data)):
    
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(train_sent, MAX_LEN)
    
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        train_data_labels.append(train_label)
        
    except Exception as e:
        print(e)
        print(train_sent)
        pass


  0%|                                                                                        | 0/45654 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████████████████████████████████████████████████████████████████████| 45654/45654 [00:07<00:00, 6114.05it/s]


In [13]:
train_sent_input_ids = np.array(input_ids, dtype=int)
train_sent_attention_masks = np.array(attention_masks, dtype=int)
train_sent_type_ids = np.array(token_type_ids, dtype=int)
train_sent_inputs = (train_sent_input_ids, train_sent_attention_masks, train_sent_type_ids)

train_data_labels = np.asarray(train_data_labels, dtype=np.int32)


print("# sents: {}, # labels: {}".format(len(train_sent_input_ids), len(train_data_labels)))

# sents: 45654, # labels: 45654


In [14]:
class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super(TFBertClassifier, self).__init__()
        
        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
                                                name='classifier')
        
    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        # outputs 값: #sequence_output, pooled_output, (hidden_states), (attention)
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        
        return logits


In [15]:
cls_model = TFBertClassifier(model_name='klue/roberta-large', dir_path='klue-bert-ckpt', num_class=7)

InternalError: Blas GEMM launch failed : a.shape=(15, 768), b.shape=(768, 768), m=15, n=768, k=768 [Op:MatMul]

In [None]:
optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
model_name = 'tf2_bert_news_classification'

#overfitting을 막기 위한 earlystop 추가
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=2e-5, patience=1)

# min_delta: the treshold that triggers the terminateion (acc should at least improve 0.0001)
# patience: no improvment epochs (patience =1은 1번 이상 개선이 없으면 종료.)

checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

#Create path if exists
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))

cp_callback = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1, save_bert_only=True, save_weights_only=True)
                            

In [None]:
history = cls_model.fit(train_sent_inputs, train_data_labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
                        validation_split = VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])
history.history