In [None]:
from google.colab import drive
drive.mount('/content/drive')          # mount Google Drive at /content/drive

!pip install transformers              # Hugging Face Transformers
!pip install tqdm                      # progress bars

from tqdm import tqdm
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

# Seed both NumPy and TensorFlow random number generators with the same value.
np.random.seed(43)
tf.random.set_seed(43)

# Korean BERT tokenizer (KLUE); downloaded files are cached under 'bert-ckpt'.
klue_tokenizer = BertTokenizer.from_pretrained(
    'klue/bert-base',
    cache_dir='bert-ckpt',
    do_lower_case=False,
)

# Training data
file_path = '/content/drive/MyDrive/NLP/ratings_train.txt'
# Tab-separated file; quoting = 3 is csv.QUOTE_NONE, so quote characters are kept as literal text.
train_data = pd.read_csv(file_path, sep = '\t', quoting = 3)
train_data.dropna(inplace=True)           # drop rows with missing values
# Number of tokens per document, measured with the same tokenizer that encodes the model inputs.
# (The original referenced an undefined name `tokenizer`, which raised NameError.)
train_data_counts = train_data.document.apply(lambda x: len(klue_tokenizer.tokenize(x)))
print(np.quantile(train_data_counts, .75)) # 75th percentile of the per-document token counts

# Model training hyperparameters
batch_size = 256      # samples per gradient step
epochs = 1            # number of passes over the training data
valid_split = 0.2     # fraction of the training data held out for validation
max_len = 25          # fixed token length per encoded sentence; presumably chosen from the 75th-percentile token count printed above — confirm

def klue_bert_tokenizer(sentence, max_len):
    """Encode one sentence into fixed-length BERT inputs.

    Args:
        sentence: Text to encode.
        max_len: Exact length of every returned sequence; shorter inputs are
            padded and longer inputs are truncated to this many tokens.

    Returns:
        A tuple ``(input_id, attention_mask, token_type_id)`` holding the
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` entries
        produced by the tokenizer.
    """
    encoded_dict = klue_tokenizer.encode_plus(
        text = sentence,
        max_length = max_len,
        # `pad_to_max_length` is deprecated; request the padding strategy explicitly.
        padding = 'max_length',
        # Truncate explicitly instead of relying on the tokenizer's
        # backward-compatibility fallback, so every row has length max_len.
        truncation = True,
        return_attention_mask = True,
        return_token_type_ids = True)

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_id = encoded_dict['token_type_ids']

    return input_id, attention_mask, token_type_id

# Build the training inputs: encode every document and collect each
# component of the encoding in its own list.
input_ids, attention_masks, token_type_ids = [], [], []

for document in tqdm(train_data['document']):
    encoded = klue_bert_tokenizer(document, max_len)
    input_ids.append(encoded[0])
    attention_masks.append(encoded[1])
    token_type_ids.append(encoded[2])

# Convert the encoded lists to integer ndarrays.
train_input_ids, train_attention_masks, train_token_type_ids = (
    np.array(encoded_column, dtype=int)
    for encoded_column in (input_ids, attention_masks, token_type_ids)
)
# Labels as an integer ndarray.
train_labels = np.array(train_data['label'].values, dtype=int)

# Bundle the three input arrays into a single tuple for the model.
train_inputs = (train_input_ids, train_attention_masks, train_token_type_ids)

# Instantiate the sequence-classification model.
model_name = 'klue/bert-base'
num_labels = 2
klue_cls_model = TFBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    from_pt=True,
)
# Print a summary of the model.
klue_cls_model.summary()

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate = 3e-5)
# TFBertForSequenceClassification returns raw logits, so the loss must be told
# to apply softmax itself. The string alias 'sparse_categorical_crossentropy'
# uses from_logits=False and would treat the logits as probabilities.
klue_cls_model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
                       optimizer = optimizer,
                       metrics = ['accuracy'])

# Run training
history = klue_cls_model.fit(
    train_inputs,
    train_labels,
    epochs = epochs,                  # use the hyperparameter defined above instead of a hard-coded 1
    batch_size = batch_size,
    validation_split = valid_split)
# Per-epoch loss/metric values recorded during training.
print(history.history)

# Save the fine-tuned model
save_dir = '/content/drive/MyDrive/NLP/tf_klue_bert_naver_movie'
# The trained model is bound to `klue_cls_model`; `cls_model` was never defined.
klue_cls_model.save_pretrained(save_dir)

# Model evaluation
# Preprocess the evaluation data the same way as the training data.
file_path = '/content/drive/MyDrive/NLP/ratings_test.txt'
test_data = pd.read_csv(file_path, sep='\t', quoting = 3)
test_data.dropna(inplace=True)            # drop rows with missing values

# Encode every test document with the same tokenizer settings used for training.
test_input_ids = []
test_attention_masks = []
test_token_type_ids = []

for sentence in tqdm(test_data['document']):
    input_id, attention_mask, token_type_id = klue_bert_tokenizer(sentence, max_len)
    test_input_ids.append(input_id)
    test_attention_masks.append(attention_mask)
    test_token_type_ids.append(token_type_id)

# Convert to ndarrays and bundle the inputs in the same order as the training inputs.
test_inputs = (np.array(test_input_ids, dtype = int),
               np.array(test_attention_masks, dtype = int),
               np.array(test_token_type_ids, dtype = int))
test_movie_labels = np.array(test_data.label.values, dtype = int)

# Evaluate the trained model (`klue_cls_model`; `cls_model` was never defined).
score = klue_cls_model.evaluate(test_inputs, test_movie_labels, batch_size = 512)
print(score)

# Load the saved model
save_dir = '/content/drive/MyDrive/NLP/tf_klue_bert_naver_movie'
loaded_klue_cls_model = TFBertForSequenceClassification.from_pretrained(save_dir)

# Compile the loaded model.
# - The model returns raw logits, so the loss needs from_logits=True.
# - A fresh optimizer is created rather than reusing the instance already
#   attached to the first model, so no optimizer state is shared between them.
loaded_klue_cls_model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
                              optimizer = tf.keras.optimizers.Adam(learning_rate = 3e-5),
                              metrics = ['accuracy'])
