# Load packages and model

In [24]:
%load_ext autoreload
%autoreload 2
import os
import sys
import logging
# sys.path

from ai_schema.eval import eval_text_span_classify
from config_ai.backend import set_tf_config
from config_ai.utils import *
from config_ai.models.mlm import TFMLMModel, get_mlm_output, eval_mlm
from config_ai.data_utils import *
from config_ai.models import load_model
from config_ai.experiments import get_model_config
from config_ai.schema import MaskedLanguageModelExample

# Restrict CUDA to device 0 before set_tf_config() inspects the available devices.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Root logger at DEBUG; every record carries timestamp, level and call site.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s][%(filename)s:%(lineno)d]:%(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Project helper. In the recorded run it logs the visible CPU/GPU devices and
# reports turning on GPU memory growth and soft device placement.
set_tf_config()


2021-10-27 11:44:18 [INFO][backend.py:57]:setting tensorflow config...
2021-10-27 11:44:18 [INFO][backend.py:61]:current devices:
2021-10-27 11:44:18 [INFO][backend.py:62]:cpus:[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
2021-10-27 11:44:18 [INFO][backend.py:63]:gpus:[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
2021-10-27 11:44:18 [INFO][backend.py:64]:setting gpu memory allow growth...
2021-10-27 11:44:18 [INFO][backend.py:67]:setting soft device placement...


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# Path of the experiment's .ini file. The original location stays the default so
# existing runs behave the same; set MLM_CONFIG_PATH to point the notebook at a
# config on another machine instead of editing this cell.
config_path = os.environ.get(
    "MLM_CONFIG_PATH",
    "/nfs/pony/chenhao/workspace/mlm_cls/mlm.ini",
)

# read_config parses the file; get_model_config derives the model's config from it
# (the displayed result holds tokenizer_config, task_config and model_name).
config = read_config(config_path)
model_config = get_model_config(config)
model_config

# Building the model object also initialises its tokenizer (see the log output).
model = TFMLMModel(config=model_config)


2021-10-27 11:44:18 [INFO][utils.py:287]:parsing config with path:/nfs/pony/chenhao/workspace/mlm_cls/mlm.ini


{'tokenizer_config': {'tokenizer_name': 'bert_word_piece',
  'tokenizer_args': {'vocabs': '/nfs/pony/chenhao/pretrain/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'}},
 'task_config': {'max_len': 256,
  'mask_percent': 0.15,
  'keep_token_path': '/nfs/pony/chenhao/data/sentiment/mlm/keep_tokens_rbw.txt'},
 'model_name': 'sentiment_mlm'}

2021-10-27 11:44:18 [INFO][core.py:41]:init model with config:
2021-10-27 11:44:18 [INFO][core.py:44]:{
    "tokenizer_config": {
        "tokenizer_name": "bert_word_piece",
        "tokenizer_args": {
            "vocabs": "/nfs/pony/chenhao/pretrain/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt"
        }
    },
    "task_config": {
        "max_len": 256,
        "mask_percent": 0.15,
        "keep_token_path": "/nfs/pony/chenhao/data/sentiment/mlm/keep_tokens_rbw.txt"
    },
    "model_name": "sentiment_mlm",
    "model_cls": "TFMLMModel"
}
2021-10-27 11:44:18 [INFO][core.py:118]:initializing tokenizer with config:
{
    "tokenizer_name": "bert_word_piece",
    "tokenizer_args": {
        "vocabs": "/nfs/pony/chenhao/pretrain/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt"
    }
}
2021-10-27 11:44:18 [INFO][core.py:126]:reinitializing tokenizer with keep_tokens
2021-10-27 11:44:18 [INFO][core.py:132]:tokenizer initialized with 6796 vocabs


# process data

In [18]:
# Read the three splits whose paths are listed under config["data_config"].
# Each bare string expression below is displayed as a "<path>: <count> items" line.
# NOTE(review): in the recorded output the dev and test paths are the same file
# (sample_m.jsonl) - confirm that is intended.
train_data_path = config["data_config"]["train_data_path"]
train_data = model.jload_lines(train_data_path)
"{}: {} items".format(train_data_path, len(train_data))

dev_data_path = config["data_config"]["dev_data_path"]
dev_data = model.jload_lines(dev_data_path)
"{}: {} items".format(dev_data_path, len(dev_data))

test_data_path = config["data_config"]["test_data_path"]
test_data = model.jload_lines(test_data_path)
"{}: {} items".format(test_data_path, len(test_data))

# Peek at the first training example (a MaskedLanguageModelExample in the recorded run).
train_data[0]

'/nfs/pony/chenhao/data/sentiment/mlm/sample.jsonl: 128 items'

'/nfs/pony/chenhao/data/sentiment/mlm/sample_m.jsonl: 128 items'

'/nfs/pony/chenhao/data/sentiment/mlm/sample_m.jsonl: 128 items'

MaskedLanguageModelExample(text='贝贝好爱干净 每天出门都要洗澡 还喜欢喝蒙牛 不喜欢蹲地方 喜欢坐凳子上还喜欢和我坐在一起~', extra_text=None, masked_tokens=None)

In [19]:
# Bind the training examples to the model through the project's DataManager; the
# next cells pull features and records from this object.
data_manager = DataManager.get_instance(model=model, data=train_data)

In [20]:
# get_features() is consumed with next(), so it yields one feature dict at a time.
# Only the first one is shown; in the recorded output it holds the text, token_ids,
# segment_ids and tokens with no [MASK] substitutions.
features = data_manager.get_features()
feature = next(features)

feature

{'full_text': '贝贝好爱干净 每天出门都要洗澡 还喜欢喝蒙牛 不喜欢蹲地方 喜欢坐凳子上还喜欢和我坐在一起~',
 'text': '贝贝好爱干净 每天出门都要洗澡 还喜欢喝蒙牛 不喜欢蹲地方 喜欢坐凳子上还喜欢和我坐在一起~',
 'extra_text': None,
 'token_ids': [101,
  6564,
  6564,
  1962,
  4263,
  2397,
  1112,
  3680,
  1921,
  1139,
  7305,
  6963,
  6206,
  3819,
  4074,
  6820,
  1599,
  3614,
  1600,
  5885,
  4281,
  679,
  1599,
  3614,
  6702,
  1765,
  3175,
  1599,
  3614,
  1777,
  1135,
  2094,
  677,
  6820,
  1599,
  3614,
  1469,
  2769,
  1777,
  1762,
  671,
  6629,
  172,
  102],
 'segment_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'tokens': ['[CLS]',
  '贝',
  '贝',
  '好',
  '爱',
  '干',
  '净',
  '每',
  '天',
  '出',
  '门',
  '都',
  '要',
  '洗',
  '澡',
  '还',
  '喜',
  '欢',
  '喝',
  '蒙',
  '牛',
  '不',
  '喜',
  '欢',
  '蹲',
  '地',
  '方',
  '喜',
  '欢',
  '坐',
  '凳',
  '子',
  '上',
  '还',
  '喜',
 

In [21]:
# Same example viewed as a training record. In the recorded output the record has an
# 'idx' field and several positions replaced by [MASK] (token id 103).
records = data_manager.get_records(mode="train")
record = next(records)
record



{'idx': 0,
 'full_text': '贝贝好爱干净 每天出门都要洗澡 还喜欢喝蒙牛 不喜欢蹲地方 喜欢坐凳子上还喜欢和我坐在一起~',
 'text': '贝贝好爱干净 每天出门都要洗澡 还喜欢喝蒙牛 不喜欢蹲地方 喜欢坐凳子上还喜欢和我坐在一起~',
 'extra_text': None,
 'token_ids': [101,
  6564,
  6564,
  1962,
  4263,
  103,
  1112,
  3680,
  1921,
  1139,
  7305,
  6963,
  6206,
  103,
  4074,
  6820,
  1599,
  3614,
  1600,
  5885,
  4281,
  679,
  1599,
  3614,
  6702,
  1765,
  3175,
  1599,
  103,
  1777,
  1135,
  2094,
  677,
  103,
  1599,
  3614,
  103,
  2769,
  1777,
  103,
  671,
  6629,
  172,
  102],
 'segment_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'tokens': ['[CLS]',
  '贝',
  '贝',
  '好',
  '爱',
  '[MASK]',
  '净',
  '每',
  '天',
  '出',
  '门',
  '都',
  '要',
  '[MASK]',
  '澡',
  '还',
  '喜',
  '欢',
  '喝',
  '蒙',
  '牛',
  '不',
  '喜',
  '欢',
  '蹲',
  '地',
  '方',
  '喜',
  '[MASK]',
  '坐',
  '凳',
  '子',
  

# Build and compile the model

In [26]:
# CUDA_VISIBLE_DEVICES is already set in the setup cell before set_tf_config()
# enumerates devices, so it is not repeated here: assigning it again at this point
# would not change which GPU TensorFlow uses.

# Keyword arguments for the network, copied from the "nn_model_config" section
# (pretrained model tag and checkpoint directory in the recorded run).
nn_model_args = dict(**config["nn_model_config"])
nn_model_args
model.build_model(**nn_model_args)



{'pretrained_model_tag': 'bert',
 'pretrained_model_path': '/nfs/pony/chenhao/pretrain/chinese_roberta_wwm_ext_L-12_H-768_A-12'}

2021-10-27 11:44:26 [INFO][mirrored_strategy.py:500]:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
2021-10-27 11:44:26 [INFO][tf_core.py:127]:number of devices: 1, use SINGLE scope
2021-10-27 11:44:26 [INFO][nn_models.py:93]:loading from pretrained weights: /nfs/pony/chenhao/pretrain/chinese_roberta_wwm_ext_L-12_H-768_A-12/model.ckpt
2021-10-27 11:44:32 [INFO][tf_mlm.py:52]:nn model's summary:
2021-10-27 11:44:32 [INFO][layer_utils.py:192]:Model: "lml_model"
2021-10-27 11:44:32 [INFO][layer_utils.py:193]:__________________________________________________________________________________________________
2021-10-27 11:44:32 [INFO][layer_utils.py:190]:Layer (type)                    Output Shape         Param #     Connected to                     
2021-10-27 11:44:32 [INFO][layer_utils.py:190]:token_ids (InputLayer)          [(None, None)]       0                                            
2021-10-27 11:44:32 [INFO][layer_utils.py:259]:____________

<tensorflow.python.keras.engine.training.Model at 0x7f695024f4f0>

In [12]:
# Optimizer settings come from the "compile_config" section; copy them into a plain
# dict, show them, then compile the training model with them.
compile_args = dict(config["compile_config"])
compile_args
model.compile_model(**compile_args)


{'optimizer_name': 'adam', 'optimizer_args': {'learning_rate': 3e-05}}

2021-10-27 11:42:51 [INFO][tf_mlm.py:59]:compiling model...
2021-10-27 11:42:51 [INFO][tf_core.py:127]:number of devices: 1, use SINGLE scope
2021-10-27 11:42:51 [INFO][losses.py:88]:build loss layer with loss function:<function sparse_categorical_crossentropy at 0x7f6aae1a60d0>
2021-10-27 11:42:52 [INFO][tf_mlm.py:78]:training model's summary:
2021-10-27 11:42:52 [INFO][layer_utils.py:192]:Model: "train_model"
2021-10-27 11:42:52 [INFO][layer_utils.py:193]:__________________________________________________________________________________________________
2021-10-27 11:42:52 [INFO][layer_utils.py:190]:Layer (type)                    Output Shape         Param #     Connected to                     
2021-10-27 11:42:52 [INFO][layer_utils.py:190]:token_ids (InputLayer)          [(None, None)]       0                                            
2021-10-27 11:42:52 [INFO][layer_utils.py:259]:__________________________________________________________________________________________________
2

In [13]:
# Inspect the compiled training model: its input tensors, outputs, losses and metrics.
# NOTE(review): the recorded output shows a last dimension of 21128, while the
# vocab_size and summary cells further down report 6796 - this output looks like it
# comes from an earlier run; re-execute to confirm.
model.train_model.inputs
model.train_model.outputs
model.train_model.losses
model.train_model.metrics


[<tf.Tensor 'token_ids:0' shape=(None, None) dtype=int32>,
 <tf.Tensor 'segment_ids:0' shape=(None, None) dtype=int32>,
 <tf.Tensor 'token_output:0' shape=(None, None) dtype=int32>]

[<tf.Tensor 'model/Identity:0' shape=(None, None, 21128) dtype=float32>]

[<tf.Tensor 'loss_layer/Identity:0' shape=() dtype=float32>]

[<tensorflow.python.keras.metrics.Mean at 0x7f6a0004d160>]

In [14]:
# Dtypes and shapes of the dataset fields per mode. In the recorded output the
# "train" info has an extra 'token_output' field that "test" does not.
model.get_dataset_info("train")
model.get_dataset_info("test")

({'token_ids': 'int32', 'segment_ids': 'int32', 'token_output': 'int32'},
 {'token_ids': (None,), 'segment_ids': (None,), 'token_output': (None,)})

({'token_ids': 'int32', 'segment_ids': 'int32'},
 {'token_ids': (None,), 'segment_ids': (None,)})

# Train the model

In [15]:
# Start from the "train_config" section and override the settings used for this
# quick run; keys already present keep their position, new keys are appended.
# NOTE(review): the recorded run of this cell stopped with KeyboardInterrupt during
# epoch 1, so it did not complete training.
train_args = {
    **config["train_config"],
    "batch_size": 16,
    "epochs": 5,
    "steps_per_epoch": 100,
    "verbose": 1,
}
train_args

model.train(train_data=train_data, **train_args)

{'epochs': 5,
 'batch_size': 16,
 'steps_per_epoch': 100,
 'overwrite_cache': False,
 'verbose': 1}

100%|██████████| 128/128 [00:00<00:00, 1342.44it/s]
2021-10-27 11:42:57 [INFO][tf_core.py:174]:train on 128 tensors


Epoch 1/5




KeyboardInterrupt: 

# Predict and evaluate

In [27]:
# Vocabulary size the model works with (6796 in the recorded run, matching the
# "tokenizer initialized with 6796 vocabs" log line).
model.vocab_size

6796

In [28]:
# Predict on the first dev example only. The commented-out alternatives below are
# hand-written examples with explicit [MASK] positions that can be swapped in.
to_pred = dev_data[:1]
# to_pred = [MaskedLanguageModelExample(text='今天天气很好，心情非常愉[MASK]',
#                                       extra_text=None)]
# to_pred = [MaskedLanguageModelExample(text='科学[MASK][MASK]是第一生产力',
#                                       extra_text=None)]

to_pred


# pred holds one list of predicted tokens per input example (see the recorded output);
# only the first few entries are displayed.
pred = model.predict(to_pred, show_detail=False)
pred[:4]


[MaskedLanguageModelExample(text='贝[MASK]好爱干净 每天出门[MASK]要洗[MASK] 还喜欢喝蒙牛 不[MASK]欢蹲地方 [MASK]欢坐凳子[MASK]还喜欢和我坐在一起~', extra_text=None, masked_tokens=['贝', '都', '澡', '喜', '喜', '上'])]

100%|██████████| 1/1 [00:00<00:00, 941.48it/s]
2021-10-27 11:44:42 [INFO][tf_core.py:205]:predicting with tf model...
2021-10-27 11:44:42 [INFO][utils.py:103]:function:_model_predict cost:0.190 seconds
2021-10-27 11:44:42 [INFO][tf_mlm.py:133]:(44, 6796)
2021-10-27 11:44:42 [INFO][tf_mlm.py:135]:[1589 3806 1158 1041 2552 1332  455 2183 1008  477 4189 4071 3626 2284
 3288 3974  828 2149  829 3459 2566  127  828 2149 3906  914 1880  828
 2149  924  473 1121   71 3974  828 2149  735 1589  924  912  119 3859
   71 1589]
2021-10-27 11:44:42 [INFO][tf_mlm.py:137]:['我', '贝', '宝', '好', '爱', '干', '净', '每', '天', '出', '门', '都', '要', '洗', '脸', '还', '喜', '欢', '喝', '蒙', '牛', '不', '喜', '欢', '蹲', '地', '方', '喜', '欢', '坐', '凳', '子', '~', '还', '喜', '欢', '和', '我', '坐', '在', '一', '起', '~', '我']
2021-10-27 11:44:42 [INFO][utils.py:103]:function:_post_predict cost:0.003 seconds


[['宝', '都', '脸', '喜', '喜', '~']]

In [29]:
# Pair each input example with its prediction. In the recorded output every item
# carries the text, the gold masked_tokens, the predicted tokens and an accuracy.
output_data = get_mlm_output(to_pred, pred)
output_data[:4]

[{'text': '贝[MASK]好爱干净 每天出门[MASK]要洗[MASK] 还喜欢喝蒙牛 不[MASK]欢蹲地方 [MASK]欢坐凳子[MASK]还喜欢和我坐在一起~',
  'extra_text': None,
  'masked_tokens': ['贝', '都', '澡', '喜', '喜', '上'],
  'predict': ['宝', '都', '脸', '喜', '喜', '~'],
  'accuracy': 0.5}]

In [30]:
# Gold tokens for the masked positions, one list per example.
true_labels = [example.masked_tokens for example in to_pred]

# Aggregate token-level evaluation over all examples.
# NOTE(review): for the same single example the recorded outputs disagree -
# get_mlm_output reports accuracy 0.5 (3 of 6 positions match when compared in
# order) while eval_mlm reports accurate_token_num 2 / accuracy 0.33. Check how
# eval_mlm counts matches.
eval_rs = eval_mlm(true_labels, pred)
eval_rs

{'item_num': 1,
 'token_num': 6,
 'accurate_token_num': 2,
 'accuracy': 0.3333333333333333}

# Save and load

In [33]:
# Print the Keras summary of the inference network before saving it, so it can be
# compared with the summary of the reloaded model below.
model.nn_model.summary()

Model: "lml_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
token_ids (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
model_2 (Model)                 (None, None, 6796)   91269004    token_ids[0][0]                  
                                                                 segment_ids[0][0]                
Total params: 91,269,004
Trainable params: 91,269,004
Non-trainable params: 0
__________________________________________________________________________________________________


In [34]:
# Target directory is named after the model.
model_path = f"./models/{model.model_name}"

# Save options come from the common config; the TF-Serving version entry, when
# present, is dropped before the options are handed to model.save().
save_args = dict(config["common_config"]["save_args"])
save_args.pop("tf_serving_version", None)

save_args
model_path

model.save(path=model_path, **save_args)




{'format': 'h5'}

'./models/sentiment_mlm'

2021-10-27 17:38:04 [INFO][core.py:73]:saving model to ./models/sentiment_mlm
2021-10-27 17:38:04 [INFO][tf_core.py:56]:saving keras model to path:./models/sentiment_mlm/nn_model/nn_model.h5
2021-10-27 17:38:08 [INFO][core.py:138]:save model done


In [35]:
# Restore the whole model (config, tokenizer and Keras weights, per the log output)
# from the directory written by model.save() above.
loaded_model = load_model(path=model_path)
loaded_model

2021-10-28 10:32:22 [INFO][core.py:84]:loading model from path:./models/sentiment_mlm
2021-10-28 10:32:22 [INFO][core.py:41]:init model with config:
2021-10-28 10:32:22 [INFO][core.py:44]:{
    "tokenizer_config": {
        "tokenizer_name": "bert_word_piece",
        "tokenizer_args": {
            "vocabs": [
                "[PAD]",
                "[UNK]",
                "[MASK]",
                "[CLS]",
                "[SEP]",
                "!",
                "\"",
                "#",
                "$",
                "%",
                "&",
                "'",
                "(",
                ")",
                "*",
                "+",
                ",",
                "-",
                ".",
                "/",
                "0",
                "1",
                "2",
                "3",
                "4",
                "5",
                "6",
                "7",
                "8",
                "9",
                ":",
              

2021-10-28 10:32:22 [INFO][core.py:118]:initializing tokenizer with config:
{
    "tokenizer_name": "bert_word_piece",
    "tokenizer_args": {
        "vocabs": [
            "[PAD]",
            "[UNK]",
            "[MASK]",
            "[CLS]",
            "[SEP]",
            "!",
            "\"",
            "#",
            "$",
            "%",
            "&",
            "'",
            "(",
            ")",
            "*",
            "+",
            ",",
            "-",
            ".",
            "/",
            "0",
            "1",
            "2",
            "3",
            "4",
            "5",
            "6",
            "7",
            "8",
            "9",
            ":",
            ";",
            "<",
            "=",
            ">",
            "?",
            "@",
            "[",
            "\\",
            "]",
            "^",
            "_",
            "a",
            "b",
            "c",
            "d",
            "e",
            "f"

2021-10-28 10:32:22 [INFO][core.py:126]:reinitializing tokenizer with keep_tokens
2021-10-28 10:32:22 [INFO][core.py:132]:tokenizer initialized with 6796 vocabs
2021-10-28 10:32:22 [INFO][tf_core.py:158]:loading keras model from path:./models/sentiment_mlm/nn_model/nn_model.h5 with format:h5
2021-10-28 10:32:33 [INFO][layer_utils.py:192]:Model: "lml_model"
2021-10-28 10:32:33 [INFO][layer_utils.py:193]:__________________________________________________________________________________________________
2021-10-28 10:32:33 [INFO][layer_utils.py:190]:Layer (type)                    Output Shape         Param #     Connected to                     
2021-10-28 10:32:33 [INFO][layer_utils.py:190]:token_ids (InputLayer)          [(None, None)]       0                                            
2021-10-28 10:32:33 [INFO][layer_utils.py:259]:__________________________________________________________________________________________________
2021-10-28 10:32:33 [INFO][layer_utils.py:190]:segment_id

<config_ai.models.mlm.tf_mlm.TFMLMModel at 0x7f6a7ca0c700>

In [41]:

# No cell in this notebook imports tensorflow by name; the original cell only worked
# if the name had leaked into the namespace from elsewhere (presumably a star import
# or an earlier session). Import it explicitly so the cell runs on a fresh kernel.
import tensorflow

# Weights file written by model.save(); derived from model_path so it follows the
# model name instead of repeating the directory as a literal. For the recorded run
# this is ./models/sentiment_mlm/nn_model/nn_model.h5, the path shown in the save log.
h5_file = os.path.join(model_path, "nn_model", "nn_model.h5")

# Load the bare Keras network, skipping compilation (inspection only), and print its
# summary for comparison with the one shown before saving.
pretrained_model = tensorflow.keras.models.load_model(filepath=h5_file, compile=False)
pretrained_model.summary()

Model: "lml_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
token_ids (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
model_2 (Model)                 (None, None, 6796)   91269004    token_ids[0][0]                  
                                                                 segment_ids[0][0]                
Total params: 91,269,004
Trainable params: 91,269,004
Non-trainable params: 0
__________________________________________________________________________________________________


In [73]:
# Run the reloaded model on the same examples to check the save/load round trip.
# NOTE(review): the recorded output below is stale - it shows 4 examples and
# 2021-10-14 timestamps, whereas to_pred is currently dev_data[:1]. Re-run this cell
# and compare loaded_pred with pred before trusting the result.
loaded_pred = loaded_model.predict(data=to_pred)
loaded_pred

100%|██████████| 4/4 [00:00<00:00, 1418.91it/s]
2021-10-14 15:05:31 [INFO][tf_core.py:205]:predicting with tf model...
2021-10-14 15:05:31 [INFO][utils.py:103]:function:_model_predict cost:0.148 seconds
2021-10-14 15:05:31 [INFO][utils.py:103]:function:_post_predict cost:0.017 seconds


[['看', '看', '看', '大', '看', '看'],
 ['大', '日', '日', '事', '事', '以', '所', '日', '看', '日', '大'],
 ['看', '大', '看'],
 ['日', '所', '日', '此', '心', '心', '所', '此', '所', '采', '然', '大', '大', '看', '所']]