In [1]:
!git clone https://github.com/hanearl/bert-for-tf2.git
!pip install -r bert-for-tf2/requirements.txt

fatal: destination path 'bert-for-tf2' already exists and is not an empty directory.
Collecting focal-loss
  Downloading https://files.pythonhosted.org/packages/2e/a8/2fcf3420d28754b7df2ddb0e06f44bcae66ad6c18a8dea12268c1d52f210/focal_loss-0.0.5-py3-none-any.whl
Installing collected packages: focal-loss
Successfully installed focal-loss-0.0.5


In [2]:
# Print the GPU / driver / memory status of the current runtime.
!nvidia-smi

Sat Jun  6 13:17:41 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
import os
import sys
import datetime
import pickle
import math
import json

sys.path.append('bert-for-tf2')
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

from custom_metrics import MultiLabelAccuracy
from sentiments_data import SentimentsData


In [0]:
# Read the run configuration (JSON) from the bert-for-tf2 checkout.
with open('./bert-for-tf2/config.json', 'r') as f:
    config = json.loads(f.read())

In [0]:
drive_path, train_name = config["drive_path"], config["train_name"]

# Root folder of the project; every other path hangs off it.
project_path = os.path.join(drive_path, "bert_sentiment")

# Inputs: the dataset files and the pretrained BERT files.
data_path = os.path.join(project_path, "data")
bert_model_path = os.path.join(project_path, "bert_model")

# Per-run outputs, one sub-folder per train_name.
tb_path = os.path.join(project_path, "logs", train_name)
epoch_model_path = os.path.join(project_path, "epoch_models", train_name)
epoch_log_path = os.path.join(project_path, "epoch_logs", train_name)

In [0]:
# Create the per-run output directories.
# os.makedirs also creates missing parent folders (each path is
# <project>/<kind>/<train_name>, and os.mkdir fails when <kind> does not exist
# yet); exist_ok=True keeps the cell safe to re-run.
for output_dir in (epoch_log_path, epoch_model_path, tb_path):
    os.makedirs(output_dir, exist_ok=True)

In [7]:
# Fetch the pretrained Google BERT model named in the config into ./.model
model_name = config['model_name']
model_dir = bert.fetch_google_bert_model(model_name, ".model")
model_ckpt = os.path.join(bert_model_path, model_dir, "bert_model.ckpt")

# Tokenizer: lower-case the input unless the model name starts with
# "cased" or "multi_cased".
is_cased_model = model_name.startswith(("cased", "multi_cased"))
do_lower_case = not is_cased_model
bert.bert_tokenization.validate_case_matches_checkpoint(do_lower_case, model_ckpt)
vocab_file = os.path.join(model_dir, "vocab.txt")
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

# Checkpoint and config files of the multilingual cased model
# (read later by create_model).
bert_ckpt_dir = os.path.join(bert_model_path, "multi_cased_L-12_H-768_A-12")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")

Already  fetched:  multi_cased_L-12_H-768_A-12.zip
already unpacked at: .model/multi_cased_L-12_H-768_A-12


In [8]:
# `data` is the pickled dataset object (later cells read train_x/train_y,
# max_seq_len and code_to_senti from it); `df` is the CSV with the
# `sentence` / `sentiments` columns.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
sentiments_pkl_path = os.path.join(data_path, "sentiments.pkl")
with open(sentiments_pkl_path, "rb") as f:
    data = pickle.load(f)

sentiments_csv_path = os.path.join(data_path, 'sentiments.csv')
df = pd.read_csv(sentiments_csv_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
def flatten_layers(root_layer):
    """Yield ``root_layer`` and every layer nested below it, depth-first.

    ``root_layer`` itself is yielded only when it is a ``keras.layers.Layer``;
    its children are taken from its ``_layers`` attribute and walked
    recursively in that order.
    """
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for child in root_layer._layers:
        yield from flatten_layers(child)


def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.

    Every layer reachable from ``l_bert`` (via ``flatten_layers``) is visited:
    layers named "LayerNorm", "adapter-down" or "adapter-up" are made
    trainable, every other leaf layer (one with no sub-layers) is frozen.
    Finally the whole embeddings layer is frozen.

    Args:
        l_bert: the BERT layer to modify in place; must expose
            ``embeddings_layer``.
    """
    for layer in flatten_layers(l_bert):
        if layer.name in ("LayerNorm", "adapter-down", "adapter-up"):
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False

    # Done once, after the walk, instead of once per visited layer.
    # NOTE(review): assumes Keras propagates `trainable` to sub-layers, so this
    # also re-freezes any LayerNorm inside the embeddings that the loop above
    # marked trainable - confirm against the Keras version in use.
    l_bert.embeddings_layer.trainable = False


def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):
    """Build a ``LearningRateScheduler`` callback: linear warm-up, then exponential decay.

    For a 0-based ``epoch``:

    * ``epoch < warmup_epoch_count``:
      ``max_learn_rate / warmup_epoch_count * (epoch + 1)``
    * otherwise:
      ``max_learn_rate * exp(log(end_learn_rate / max_learn_rate) * k / K)``
      with ``k = epoch - warmup_epoch_count + 1`` and
      ``K = total_epoch_count - warmup_epoch_count + 1``.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler`` (``verbose=1``).
    """

    def lr_scheduler(epoch):
        # Warm-up phase: ramp linearly up to max_learn_rate.
        if epoch < warmup_epoch_count:
            return float((max_learn_rate/warmup_epoch_count) * (epoch + 1))

        # Decay phase: interpolate geometrically from max towards end.
        log_ratio = math.log(end_learn_rate/max_learn_rate)
        steps_done = epoch-warmup_epoch_count+1
        decay_span = total_epoch_count-warmup_epoch_count+1
        return float(max_learn_rate*math.exp(log_ratio*steps_done/decay_span))

    return tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

In [0]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Saves weights periodically and logs predictions for fixed sample sentences.

    At the end of every epoch the callback:

    1. saves the model weights to ``<epoch_model_path>/sentiments.h5`` when the
       1-based epoch number is a multiple of ``config['save_model_period']``;
    2. runs the model on ten fixed sentences (rows 10..19 of ``df``) and writes
       the text, the reference labels and the predicted labels to
       ``<epoch_log_path>/epoch_res_<epoch>.txt``.

    Relies on the notebook globals ``df``, ``data``, ``tokenizer``, ``config``,
    ``epoch_model_path`` and ``epoch_log_path``.

    Args:
        max_seq_len: length every encoded sentence is padded/truncated to.
            Defaults to ``data.max_seq_len``, the value the model is built with.
    """

    def __init__(self, max_seq_len=None):
        # Let the Keras base class initialise its own attributes.
        super().__init__()
        self.pred_sentences = [df.sentence[i] for i in range(10, 20)]
        self.pred_sentiments = [df.sentiments[i] for i in range(10, 20)]
        self.max_seq_len = data.max_seq_len if max_seq_len is None else max_seq_len

    def _encode(self, sentences):
        """Turn sentences into a (len(sentences), max_seq_len) array of token ids."""
        # Reserve two positions for [CLS] / [SEP]; truncating keeps every row
        # the same length so np.array builds a rectangular array.
        body_len = self.max_seq_len - 2
        rows = []
        for sentence in sentences:
            tokens = ["[CLS]"] + tokenizer.tokenize(sentence)[:body_len] + ["[SEP]"]
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            rows.append(token_ids + [0] * (self.max_seq_len - len(token_ids)))
        return np.array(rows)

    def on_epoch_end(self, epoch, logs=None):
        epoch = epoch + 1  # Keras passes a 0-based epoch index
        if epoch % config['save_model_period'] == 0:
            self.model.save_weights(os.path.join(epoch_model_path, 'sentiments.h5'), overwrite=True)

        # The model outputs logits; threshold the sigmoid at 0.5 per label.
        res = self.model.predict(self._encode(self.pred_sentences))
        res = tf.sigmoid(res)
        res = tf.cast(res > 0.5, dtype=tf.int32).numpy()

        res_string = 'epoch: {}\n'.format(epoch)
        for text, label, sentiment in zip(self.pred_sentences, self.pred_sentiments, res):
            # Indices of the predicted labels, mapped to their names.
            pred_sentiments = [data.code_to_senti[code] for code in np.flatnonzero(sentiment)]
            res_string += "text: {}\nlabels: {}\nres: {}\n\n".format(text, label, pred_sentiments)

        # Explicit UTF-8 so non-ASCII sentences are written the same way on
        # every platform.
        log_file = os.path.join(epoch_log_path, 'epoch_res_{}.txt'.format(epoch))
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write(res_string)

In [0]:
# One TensorBoard log directory per run, named by its start time.
# "%S" is the two-digit seconds field; lowercase "%s" is not a documented
# strftime code (a platform-specific extension), so it must not be used here.
log_dir = os.path.join(tb_path, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

In [0]:
from focal_loss import BinaryFocalLoss

def create_model(max_seq_len, adapter_size=64):
    """Creates a classification model.

    Builds a BERT layer from ``bert_config_file``, puts a
    Dropout -> Dense(768, tanh) -> Dropout -> Dense(34) head on the output at
    sequence position 0, loads the pretrained weights from ``bert_ckpt_file``
    and compiles the model with Adam, the loss selected by
    ``config['loss_func']`` and the ``MultiLabelAccuracy`` metric.

    Args:
        max_seq_len: length of the ``input_ids`` sequences the model accepts.
        adapter_size: adapter size (see arXiv:1902.00751). When not ``None``
            the BERT layers are frozen via ``freeze_bert_layers``; ``None``
            fine-tunes all of BERT.

    Returns:
        The compiled ``keras.Model`` mapping ``input_ids`` to 34 logits.

    Raises:
        KeyError: if ``config['loss_func']`` is not
            "sigmoid_cross_entropy_loss" or "focal_loss".
    """
    # create the bert layer
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
    bert_params = map_stock_config_to_params(bc)
    bert_params.adapter_size = adapter_size
    # Local is called l_bert so it does not shadow the imported `bert` module.
    l_bert = BertModelLayer.from_params(bert_params, name="bert")

    # Only token ids are fed to BERT; no token_type_ids input is used.
    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    output = l_bert(input_ids)

    print("bert shape", output.shape)
    # Classification head on the first sequence position.
    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=34)(logits)

    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    # load the pre-trained model weights
    load_stock_weights(l_bert, bert_ckpt_file)

    # freeze weights if adapter-BERT is used
    if adapter_size is not None:
        freeze_bert_layers(l_bert)

    def sigmoid_cross_entropy_loss(true, pred):
        """Multi-label loss: sum over labels, mean over the batch."""
        per_label = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.cast(true, pred.dtype), logits=pred)
        # Sum over the label axis only; an axis-less reduce_sum would collapse
        # the batch too, making the mean a no-op and the loss scale with the
        # batch size.
        per_example = tf.reduce_sum(per_label, axis=-1)
        return tf.reduce_mean(per_example)

    focal_loss = BinaryFocalLoss(gamma=1, from_logits=True)

    loss_func_list = {
        "sigmoid_cross_entropy_loss": sigmoid_cross_entropy_loss,
        "focal_loss": focal_loss
    }

    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=loss_func_list[config['loss_func']],
                  metrics=[MultiLabelAccuracy()])

    model.summary()

    return model

In [0]:
# Build the model, train it with the callbacks below, then store the final weights.
adapter_size = None # use None to fine-tune all of BERT
model = create_model(data.max_seq_len, adapter_size=adapter_size)
total_epoch_count = config['num_epochs']

# Learning-rate warm-up / decay over the configured number of epochs.
lr_schedule_callback = create_learning_rate_scheduler(
    max_learn_rate=1e-5,
    end_learn_rate=1e-7,
    warmup_epoch_count=config['warmup_epoch_count'],
    total_epoch_count=total_epoch_count,
)
early_stopping_callback = keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
training_callbacks = [
    lr_schedule_callback,
    early_stopping_callback,
    tensorboard_callback,
    MyCustomCallback(),
]

model.fit(
    x=data.train_x,
    y=data.train_y,
    validation_split=0.1,
    batch_size=config['batch_size'],
    shuffle=True,
    epochs=total_epoch_count,
    callbacks=training_callbacks,
)

final_weights_path = os.path.join(epoch_model_path, 'sentiments_fin.h5')
model.save_weights(final_weights_path, overwrite=True)


bert shape (None, 128, 768)
Done loading 196 BERT weights from: /content/drive/My Drive/bert_sentiment/bert_model/multi_cased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7fa7f7602978> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 128)]             0         
_________________________________________________________________
bert (

In [0]:
# Rebuild the same architecture and restore the weights saved after training.
adapter_size = None # use None to fine-tune all of BERT
model = create_model(max_seq_len=data.max_seq_len, adapter_size=adapter_size)

final_weights_path = os.path.join(epoch_model_path, 'sentiments_fin.h5')
model.load_weights(final_weights_path)

In [0]:
# Sentences to run inference on: the `sentence` values at index labels 10230..10269 of df.
pred_sentences = [df.sentence[i] for i in range(10230, 10270)]
# pred_sentiments = [df.sentiments[i] for i in range(100, 150)]
# pred_sentences = [
#                   "공감을 누르지말아달라하시니 눌러드리는게 인지상정!",
#                   "저도.. 아무리 장난이라지만 다른 사람의 마음을 가지고 장난칠 수 있는 이런걸 보고 웃는게 별로네요. 장난인데 제가 너무 심각하게 받아들인건지는 몰라도..",
#                   "선의가 아닌 오자랖. 자기차는 그러고 있으면 사고 안나나?",
#                   "손크기 차이 뭐야ㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠ뭐냐고ㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠ언니 너무 커여워요ㅠㅠㅠㅠㅠㅠㅠㅠ 언니 주먹으로 꿀밤맞고싶다ㅠㅠㅠㅠ 귀여워ㅠㅠㅠㅠㅠㅠ",
#                   "나라카일 !! 개리형은 다이아에 있을 실력이 아니야 카일이 형이랑 같이 올라가자!",
#                   "진짜 나라카일이랑 해도 이기기가 힘드네ㅋㅋ 킹텀ㅋㅋ 진짜ㅋㅋ",
#                   "몽자야.. 너 강아지 아니지 사람이잖아.. 그치..?",
#                   "몽자 댕청해ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ몸은 김종국인데 왤케 뽀짝하지",
#                   "몽자도 몽자인데 주인 부부님이 너무 유쾌하고 웃기셔 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ ㅋㅋㅋㅋㅋ",
#                   "몽자는 귓털이 횟가닥 깻잎머리마냥 앞 얼굴에 사방팔방 뒤엉켜 붙을때 잴 귀여움",
#                   "몽자야ㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠ너무기여우어ㅠㅠㅜㅜㅜㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠ 고개 갸우뚱~?",
#                   "우리 몽자 너무너무 사랑해ㅐ",
#                   "몽자야ㅠㅠㅠㅠㅠ 머리 부스스한 거 왜 이렇게 귀여운 거야ㅠㅠㅠㅠ 승질내도 왜 귀엽기만 한 거야 ㅠㅠㅠㅠ",
#                   "아니얘는 평소에도 부스스해서 너무 웃겨 ㅋㅌㅌㅌㅋㅋㅌㅌㅌㅋㅋ",

# ]
# Encode each sentence as [CLS] tokens [SEP], truncated and zero-padded to the
# length the model was built with (create_model received data.max_seq_len), so
# every row has the same length even for long sentences.
pred_seq_len = data.max_seq_len
pred_tokens = [
    ["[CLS]"] + tokenizer.tokenize(sentence)[:pred_seq_len - 2] + ["[SEP]"]
    for sentence in pred_sentences
]
pred_token_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in pred_tokens]
pred_token_ids = np.array([tids + [0] * (pred_seq_len - len(tids)) for tids in pred_token_ids])

# The model outputs logits; a label is predicted when its sigmoid exceeds 0.5.
res = model.predict(pred_token_ids)
res = tf.sigmoid(res)
res = tf.cast(res > 0.5, dtype=tf.int32).numpy()

res_string = ''
for text, sentiment in zip(pred_sentences, res):
    # Indices of the predicted labels, mapped to their names.
    pred_sentiments = [data.code_to_senti[code] for code in np.flatnonzero(sentiment)]
    res_string += "text: {}\nres: {}\n\n".format(text, pred_sentiments)

print(res_string)

In [0]:
# (Stray expression `d` removed: no cell in this notebook defines that name, so
# evaluating it raised NameError and stopped a top-to-bottom run.)

In [0]:
def sig(output):
    """Return ``log(output / (1 - output))`` computed with TensorFlow ops.

    NOTE(review): despite its name this is the logit (the inverse of the
    sigmoid), not the sigmoid itself.
    """
    odds = output / (1 - output)
    return tf.math.log(odds)