In [9]:
import pandas as pd
import numpy as np

In [37]:
import tokenization
import tensorflow as tf
import tensorflow_hub as hub
from keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [10]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [70]:
label = preprocessing.LabelEncoder()
y = label.fit_transform(train_data['Target'])
# y = to_categorical(y)
print(y[:5])

[4 4 4 4 6]


In [71]:
label.inverse_transform(y)

array(['neutral', 'neutral', 'neutral', ..., 'surprise', 'neutral', 'joy'],
      dtype=object)

In [13]:
m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(m_url, trainable=True)

In [19]:
pip install bert-tensorflow

Collecting bert-tensorflow
  Downloading bert_tensorflow-1.0.4-py2.py3-none-any.whl (64 kB)
     ---------------------------------------- 64.4/64.4 kB ? eta 0:00:00
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.4
Note: you may need to restart the kernel to use updated packages.


In [27]:
from bert import tokenization

In [116]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
        
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len-len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
        
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [117]:
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    
    clf_output = sequence_output[:, 0, :]
    
    lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    lay = tf.keras.layers.Dense(32, activation='relu')(lay)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    out = tf.keras.layers.Dense(7, activation='softmax')(lay)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [118]:
pip install bert-tensorflow==1.0.1

Note: you may need to restart the kernel to use updated packages.


In [119]:
from bert import tokenization

In [120]:
## 아래 코드 문제 해결
import sys
from absl import flags
sys.argv=['preserve_unused_tokens=False']
flags.FLAGS(sys.argv)

['preserve_unused_tokens=False']

In [121]:
max_len = 52
train_input = bert_encode(train_data.Utterance.values, tokenizer, max_len=max_len)
test_input = bert_encode(test_data.Utterance.values, tokenizer, max_len=max_len)
train_labels = y

In [122]:
labels = label.classes_
print(labels)

['anger' 'disgust' 'fear' 'joy' 'neutral' 'sadness' 'surprise']


In [123]:
model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 52)]         0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 52)]         0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 52)]         0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 52, 768)]                 'input_mask[0][0]',       

  super(Adam, self).__init__(name, **kwargs)


In [124]:
train_labels

array([4, 4, 4, ..., 6, 4, 3])

In [125]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

train_sh = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    callbacks=[checkpoint, earlystopping],
    batch_size=70,
    verbose=1  
)

Epoch 1/3


ValueError: in user code:

    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 890, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 948, in compute_loss
        return self.compiled_loss(
    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\losses.py", line 139, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\losses.py", line 243, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\losses.py", line 1787, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "c:\Users\user\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend.py", line 5119, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 7) are incompatible


In [115]:
y_pred = np.argmax(model.predict(test_input), axis = -1)
y_pred



array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [102]:
from sklearn.metrics import f1_score

In [107]:
pred = np.array(labels)[y_pred]

In [108]:
pred

array(['anger', 'anger', 'anger', ..., 'anger', 'anger', 'anger'],
      dtype=object)

In [109]:
submission = pd.read_csv('data/sample_submission.csv')

In [110]:
submission

Unnamed: 0,ID,Target
0,TEST_0000,NAN
1,TEST_0001,NAN
2,TEST_0002,NAN
3,TEST_0003,NAN
4,TEST_0004,NAN
...,...,...
2605,TEST_2605,NAN
2606,TEST_2606,NAN
2607,TEST_2607,NAN
2608,TEST_2608,NAN


In [111]:
submission['Target'] = pred

In [112]:
submission

Unnamed: 0,ID,Target
0,TEST_0000,anger
1,TEST_0001,anger
2,TEST_0002,anger
3,TEST_0003,anger
4,TEST_0004,anger
...,...,...
2605,TEST_2605,anger
2606,TEST_2606,anger
2607,TEST_2607,anger
2608,TEST_2608,anger


In [113]:
submission.to_csv('submit.csv', index=False)