In [1]:
import numpy as np 
import pandas as pd

!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

import tokenization
import tensorflow as tf
import tensorflow_hub as hub
from keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

'wget'��(��) ���� �Ǵ� �ܺ� ����, ������ �� �ִ� ���α׷�, �Ǵ�
��ġ ������ �ƴմϴ�.


In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
data_info = pd.read_csv('data/data_info.csv', encoding = 'cp949')

In [3]:
label = preprocessing.LabelEncoder()
y = label.fit_transform(train['Target'])
y = to_categorical(y)

In [4]:
m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(m_url, trainable=True)



In [5]:
from bert import tokenization

In [6]:
tf.gfile = tf.io.gfile

In [7]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=100):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
        
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len-len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
        
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [32]:
def build_model(bert_layer, max_len=100):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    
    clf_output = sequence_output[:, 0, :]
    
    lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    lay = tf.keras.layers.Dropout(0.4)(lay)
    lay = tf.keras.layers.Dense(32, activation='relu')(lay)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    out = tf.keras.layers.Dense(7, activation='softmax')(lay)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])
        
    return model

In [33]:
train_data = train['Utterance']
test_data = train['Utterance']

In [34]:
max_len = 100
train_input = bert_encode(train_data, tokenizer, max_len=max_len)
test_input = bert_encode(test_data, tokenizer, max_len=max_len)
train_labels = y

In [35]:
labels = label.classes_
print(labels)

['anger' 'disgust' 'fear' 'joy' 'neutral' 'sadness' 'surprise']


In [36]:
model = build_model(bert_layer, max_len=max_len)
model.summary()



Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 100)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 100)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 100, 768)]                'input_mask[0][0]',       

In [37]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

train_sh = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    # callbacks=[checkpoint, earlystopping],
    batch_size=64,
    verbose=1 # 학습 진행 상황 확인
)

Epoch 1/3
  9/125 [=>............................] - ETA: 38:34 - loss: 2.0100 - accuracy: 0.2014

KeyboardInterrupt: 

In [27]:
y_pred = model.predict(test_input)
y_pred



array([[0.11254419, 0.03552848, 0.03801102, ..., 0.42099532, 0.07214607,
        0.13690576],
       [0.11254426, 0.0355286 , 0.03801114, ..., 0.4209949 , 0.07214621,
        0.13690566],
       [0.11254428, 0.03552861, 0.03801115, ..., 0.4209949 , 0.07214621,
        0.13690566],
       ...,
       [0.11254419, 0.0355285 , 0.03801105, ..., 0.42099515, 0.0721461 ,
        0.13690573],
       [0.11254422, 0.03552856, 0.03801109, ..., 0.42099515, 0.07214615,
        0.13690567],
       [0.1125442 , 0.03552853, 0.03801107, ..., 0.42099515, 0.07214611,
        0.13690573]], dtype=float32)

In [78]:
y_pred = np.argmax(model.predict(test_input), axis=-1)
y_pred
y_pred = np.array(label.classes_)[y_pred]



In [83]:
len(y_pred)

9989

In [82]:
submit = pd.read_csv('data/sample_submission.csv')
# submit['Target'] = y_pred
submit
# submit.to_csv('./submit_keras.csv', index=False)

Unnamed: 0,ID,Target
0,TEST_0000,NAN
1,TEST_0001,NAN
2,TEST_0002,NAN
3,TEST_0003,NAN
4,TEST_0004,NAN
...,...,...
2605,TEST_2605,NAN
2606,TEST_2606,NAN
2607,TEST_2607,NAN
2608,TEST_2608,NAN
