In [8]:
import os
import re
import numpy as np
from tqdm import tqdm

import tensorflow as tf
from transformers import BertModel, TFBertModel
from transformers import BertTokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string],"")
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

In [14]:
# random seed 고정
tf.random.set_seed(1234)
np.random.seed(1234)

BATCH_SIZE = 32
NUM_EPOCHS = 3
VALID_SPLIT = 0.2
MAX_LEN = 44 # EDA에서 추출된 max Lenght => EDA: 탐색적 데이터분석
DATA_IN_PATH = 'data_in/open/'
DATA_OUT_PATH = 'data_out'


In [10]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [16]:
train_data = pd.read_csv(DATA_IN_PATH + "train_data.csv")
test_data = pd.read_csv(DATA_IN_PATH + "test_data.csv")
submission = pd.read_csv(DATA_IN_PATH + "sample_submission.csv")

In [17]:
def bert_tokenizer(sent, MAX_LEN):
    
    encoded_dict = tokenizer.encode_plus(
        text = sent,
        add_special_tokens = True,  # Add [CLS] + [SEP]
        max_length = MAX_LEN,       # Pad & truncate all sentences(truncate: 길이를 줄이다.)
        pad_to_max_length = True,
        return_attention_mask = True #Construct attn masks 
    )
    
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask'] # And its attention mask (simply differentiates padding from non-padding).
    token_type_id = encoded_dict['token_type_ids']
    
    return input_id, attention_mask, token_type_id

In [24]:
train_data.sample(5)

Unnamed: 0,index,title,topic_idx
25316,25316,중국 매체들 홍콩시위 옹호 영국 맹비난,4
21401,21401,리뷰 우리는 진정 해방됐나…뮤지컬 아리랑,3
12383,12383,브렉시트 수렁에 빠진 英정부 조기총선 놓고도 갑론을박,4
28685,28685,2분기도 수출 회복 어렵다…반도체마저 나빠질듯,1
28951,28951,인공지능에 얼굴인식까지…미래 스마트폰 갤럭시S8종합,0


In [31]:
test_data.sample(5)

Unnamed: 0,index,title
6818,52472,하반기 경제 중도금 대출 규제 분양시장 냉각 우려
5988,51642,전직 대법관 영장기각 항의
6993,52647,북한 핵실험 날 주민 2명 압록강 건너 탈북했다
6222,51876,카카오 7천544억 규모 유증…로엔엔터 지분 취득
1536,47190,경기 8개 시·군 호우주의보…장마 영향 호우특보 확대


In [30]:
submission.sample(5)

Unnamed: 0,index,topic_idx
3180,48834,0
3946,49600,0
918,46572,0
2997,48651,0
5862,51516,0


In [32]:
input_ids = []
attention_masks = []
token_type_ids = []
train_data_labels = []

for train_sent, train_label in tqdm(zip(train_data["title"], train_data['topic_idx']), total=len(train_data)):
    
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(train_sent, MAX_LEN)
    
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        train_data_labels.append(train_label)
        
    except Exception as e:
        print(e)
        print(train_sent)
        pass


  0%|                                                                                        | 0/45654 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████████████████████████████████████████████████████████████████████| 45654/45654 [00:07<00:00, 5782.76it/s]


In [36]:
train_sent_input_ids = np.array(input_ids, dtype=int)
train_sent_attention_masks = np.array(attention_masks, dtype=int)
train_sent_type_ids = np.array(token_type_ids, dtype=int)
train_sent_inputs = (train_sent_input_ids, train_sent_attention_masks, train_sent_type_ids)

train_data_labels = np.asarray(train_data_labels, dtype=np.int32)


print("# sents: {}, # labels: {}".format(len(train_sent_input_ids), len(train_data_labels)))

# sents: 45654, # labels: 45654
