In [3]:
!pip install transformers

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [4]:
from transformers import *

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [5]:
def bert_tokenizer(sent, MAX_LEN):
    
    encoded_dict = tokenizer.encode_plus(
        text = sent,
        add_special_tokens = True, #[CLS],[SET]
        max_length = MAX_LEN,
        pad_to_max_length = True,
        return_attention_mask = True,
    )
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask'] # 1 for attention, 0 for nothing(padding)
    token_type_id = encoded_dict['token_type_ids']
    
    return input_id, attention_mask, token_type_id

In [6]:
print(tokenizer.all_special_tokens)
print(tokenizer.all_special_ids)

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
[100, 102, 0, 101, 103]


In [7]:
kor_encode = tokenizer.encode("안녕하세요, 자연어처리")
eng_encode = tokenizer.encode("Hello, NLP")
kor_decode = tokenizer.decode(kor_encode)
eng_decode = tokenizer.decode(eng_encode)

print(kor_encode)
print(eng_encode)
print(kor_decode)
print(eng_decode)

[101, 9521, 118741, 35506, 24982, 48549, 117, 9651, 25486, 12965, 60469, 12692, 102]
[101, 31178, 117, 81130, 11127, 102]
[CLS] 안녕하세요, 자연어처리 [SEP]
[CLS] Hello, NLP [SEP]


In [8]:
with open('../input/naver-movie-review-dataset/ratings_train.txt') as f:
    train_text = f.read()
with open('../input/naver-movie-review-dataset/ratings_test.txt') as f:
    test_text = f.read()
    

In [9]:
import pandas as pd
import numpy as np

tr_list = []
te_list = []
for t in train_text.split('\n'):
    tr_list.append(t.split('\t'))
for t in test_text.split('\n'):
    te_list.append(t.split('\t'))
    

In [10]:
train_data=pd.DataFrame(tr_list[1:-1],columns=['id','document','label'])
test_data=pd.DataFrame(te_list[1:-1],columns=['id','document','label'])


In [11]:
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [12]:
import re

MAX_LEN = 39
input_ids = []
attention_masks = []
token_type_ids = []
train_data_labels = []

def clean_text(sent):
    sent_clean = re.sub("[^ 가-힣ㄱ-ㅎㅏ-ㅣ\\s]"," ",sent)
    return sent_clean

for train_sent, train_label in zip(train_data['document'],train_data['label']):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(clean_text(train_sent),MAX_LEN)
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        train_data_labels.append(train_label)
    except Exception as e:
        print(e)
        print(train_sent)
        pass

train_movie_input_ids = np.array(input_ids,dtype=int)
train_movie_attention_masks = np.array(attention_masks, dtype=int)
train_movie_type_ids = np.array(token_type_ids, dtype=int)
train_movie_inputs = (train_movie_input_ids, train_movie_attention_masks, train_movie_type_ids)
train_data_labels = np.asarray(train_data_labels, dtype=np.int32)
print("# sents: {}, # labels: {}".format(len(train_movie_input_ids),len(train_data_labels)))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# sents: 150000, # labels: 150000


In [13]:
sample = 3
print(train_movie_input_ids[sample])
print(train_movie_attention_masks[sample])
print(train_movie_type_ids[sample])
print(train_data_labels[sample])
print(tokenizer.decode(train_movie_input_ids[sample]))

[   101   8907  12092  22333 110148  17196 118922   9451  33077  18108
   9659  22458  11018  39218   9926  34907   9678  16605    102      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
0
[CLS] 교도소 이야기구먼 솔직히 재미는 없다 평점 조정 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [14]:
tokenizer.encode('교도소 이야기')

[101, 8907, 12092, 22333, 110148, 102]

In [15]:
import tensorflow as tf

In [16]:
class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super(TFBertClassifier, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class, kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),name='classifier')
    
    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # outputs == (sequence_output, pooled_output, hidden_states, attentions)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        return logits

cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',dir_path='bert_ckpt',num_class=2)

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [17]:
optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [18]:
import os
model_name = 'bert_classifier'
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',patience=2,min_delta=0.0001)
checkpoint_path = os.path.join('./',model_name,'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir,exist_ok=True)

cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,monitor='val_accuracy',verbose=1,save_best_only=True,save_weights_only=True)
    

In [19]:
history = cls_model.fit(train_movie_inputs, train_data_labels, epochs=2, batch_size=128, validation_split=0.1, callbacks=[cp_callback,earlystop_callback])

Epoch 1/2

KeyboardInterrupt: 