In [32]:
from absl import logging
# Raise absl verbosity to INFO so that INFO-level messages logged through
# absl (for example while the encoder is constructed below) are displayed.
logging.set_verbosity("INFO")
import tensorflow as tf
from tf_transformers.models import UNILMEncoder

In [33]:
import json

# Start from the BERT-base configuration file. The context manager makes sure
# the file handle is closed as soon as the JSON has been parsed.
with open("../model_directory/bert_base/bert_config.json") as config_file:
    config = json.load(config_file)

# Override the BERT-base values with the dimensions used for this UniLM model.
config.update({
    'embedding_size': 1024,
    'vocab_size': 28996,
    'num_hidden_layers': 24,
    'type_vocab_size': 6,
    'max_position_embeddings': 768,
    'num_attention_heads': 16,  # 1024/16 = 64
    'intermediate_size': 4096,
    'layer_norm_epsilon': 1e-5,
})

# UniLM encoder; the converted checkpoint weights are assigned to its
# variables further down in the notebook.
unilm_layer = UNILMEncoder(config=config,
                  name='unilm',
                  mask_mode='prefix',
                  is_training=False)

INFO:absl:We are overwriding `is_training` is False to `is_training` to True with `use_dropout` is False, no effects on your inference pipeline
INFO:absl:Inputs -->
INFO:absl:input_ids ---> Tensor("input_ids:0", shape=(None, None), dtype=int32)
INFO:absl:input_mask ---> Tensor("input_mask:0", shape=(None, None), dtype=int32)
INFO:absl:input_type_ids ---> Tensor("input_type_ids:0", shape=(None, None), dtype=int32)
INFO:absl:Initialized Variables


In [34]:
# Show each encoder variable next to its shape; these names are the targets
# that checkpoint weights get mapped onto.
for tf_variable in unilm_layer.variables:
    print(tf_variable.name, tf_variable.shape, sep=' --> ')

tf_transformers/last_logits_bias:0 --> (28996,)
tf_transformers/unilm/word_embeddings/embeddings:0 --> (28996, 1024)
tf_transformers/unilm/type_embeddings/embeddings:0 --> (6, 1024)
tf_transformers/unilm/positional_embeddings/embeddings:0 --> (768, 1024)
tf_transformers/unilm/embeddings/layer_norm/gamma:0 --> (1024,)
tf_transformers/unilm/embeddings/layer_norm/beta:0 --> (1024,)
tf_transformers/unilm/transformer/layer_0/self_attention/query/kernel:0 --> (1024, 16, 64)
tf_transformers/unilm/transformer/layer_0/self_attention/query/bias:0 --> (16, 64)
tf_transformers/unilm/transformer/layer_0/self_attention/key/kernel:0 --> (1024, 16, 64)
tf_transformers/unilm/transformer/layer_0/self_attention/key/bias:0 --> (16, 64)
tf_transformers/unilm/transformer/layer_0/self_attention/value/kernel:0 --> (1024, 16, 64)
tf_transformers/unilm/transformer/layer_0/self_attention/value/bias:0 --> (16, 64)
tf_transformers/unilm/transformer/layer_0/self_attention_output/kernel:0 --> (1024, 1024)
tf_transfo

In [3]:
# Load the CNN/DailyMail UniLM checkpoint (PyTorch format) onto the CPU.

import torch
import numpy as np

model_recover_path = '../../../Projects/unilm/src/data/cnndm_model/cnndm_model.bin'
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only point this at a checkpoint obtained from a trusted source.
model_recover = torch.load(model_recover_path, map_location=torch.device('cpu'))

# The checkpoint stores float16 tensors; convert every entry to a float32
# NumPy array, keyed by the original parameter name.
model_recover_float32 = {}
for param_name, param_tensor in model_recover.items():
    model_recover_float32[param_name] = param_tensor.numpy().astype(np.float32)

# The float16 originals are no longer needed; drop them to free memory.
del model_recover

In [37]:
# Per-layer parameter name templates in the PyTorch checkpoint. The `{}`
# placeholder receives the layer index. Entry k of this list corresponds to
# entry k of `to_model_vars`.
from_model_vars = [
    'bert.encoder.layer.{}.attention.self.query.weight',
    'bert.encoder.layer.{}.attention.self.query.bias',
    'bert.encoder.layer.{}.attention.self.key.weight',
    'bert.encoder.layer.{}.attention.self.key.bias',
    'bert.encoder.layer.{}.attention.self.value.weight',
    'bert.encoder.layer.{}.attention.self.value.bias',
    'bert.encoder.layer.{}.attention.output.dense.weight',
    'bert.encoder.layer.{}.attention.output.dense.bias',
    'bert.encoder.layer.{}.attention.output.LayerNorm.weight',
    'bert.encoder.layer.{}.attention.output.LayerNorm.bias',
    'bert.encoder.layer.{}.intermediate.dense.weight',
    'bert.encoder.layer.{}.intermediate.dense.bias',
    'bert.encoder.layer.{}.output.dense.weight',
    'bert.encoder.layer.{}.output.dense.bias',
    'bert.encoder.layer.{}.output.LayerNorm.weight',
    'bert.encoder.layer.{}.output.LayerNorm.bias',
]

# Matching per-layer variable name templates in the tf_transformers encoder.
to_model_vars = [
    'tf_transformers/unilm/transformer/layer_{}/self_attention/query/kernel:0',
    'tf_transformers/unilm/transformer/layer_{}/self_attention/query/bias:0',
    'tf_transformers/unilm/transformer/layer_{}/self_attention/key/kernel:0',
    'tf_transformers/unilm/transformer/layer_{}/self_attention/key/bias:0',
    'tf_transformers/unilm/transformer/layer_{}/self_attention/value/kernel:0',
    'tf_transformers/unilm/transformer/layer_{}/self_attention/value/bias:0',
    'tf_transformers/unilm/transformer/layer_{}/self_attention_output/kernel:0',
    'tf_transformers/unilm/transformer/layer_{}/self_attention_output/bias:0',
    'tf_transformers/unilm/transformer/layer_{}/self_attention_layer_norm/gamma:0',
    'tf_transformers/unilm/transformer/layer_{}/self_attention_layer_norm/beta:0',
    'tf_transformers/unilm/transformer/layer_{}/intermediate/kernel:0',
    'tf_transformers/unilm/transformer/layer_{}/intermediate/bias:0',
    'tf_transformers/unilm/transformer/layer_{}/output/kernel:0',
    'tf_transformers/unilm/transformer/layer_{}/output/bias:0',
    'tf_transformers/unilm/transformer/layer_{}/output_layer_norm/gamma:0',
    'tf_transformers/unilm/transformer/layer_{}/output_layer_norm/beta:0',
]

assert len(from_model_vars) == len(to_model_vars)

# checkpoint parameter name -> tf_transformers variable name.
# Expand every template pair for each transformer layer (all layers of one
# parameter kind first, then the next kind).
mapping_dict = {}
for torch_template, tf_template in zip(from_model_vars, to_model_vars):
    for layer_id in range(config['num_hidden_layers']):
        mapping_dict[torch_template.format(layer_id)] = tf_template.format(layer_id)

# Parameters that exist once per model rather than once per layer.
mapping_dict.update({
    # Word Embeddings
    'bert.embeddings.word_embeddings.weight': 'tf_transformers/unilm/word_embeddings/embeddings:0',
    # Positional Embedding
    'bert.embeddings.position_embeddings.weight': 'tf_transformers/unilm/positional_embeddings/embeddings:0',
    # Type Embeddings
    'bert.embeddings.token_type_embeddings.weight': 'tf_transformers/unilm/type_embeddings/embeddings:0',
    'bert.embeddings.LayerNorm.weight': 'tf_transformers/unilm/embeddings/layer_norm/gamma:0',
    'bert.embeddings.LayerNorm.bias': 'tf_transformers/unilm/embeddings/layer_norm/beta:0',
    # Pooler Layer
    'bert.pooler.dense.weight': 'tf_transformers/unilm/pooler_transform/kernel:0',
    'bert.pooler.dense.bias': 'tf_transformers/unilm/pooler_transform/bias:0',
    # Extra layers (prediction head)
    'cls.predictions.bias': 'tf_transformers/last_logits_bias:0',
    'cls.predictions.transform.dense.weight': 'tf_transformers/unilm/mlm_layer/dense/kernel:0',
    'cls.predictions.transform.dense.bias': 'tf_transformers/unilm/mlm_layer/dense/bias:0',
    'cls.predictions.transform.LayerNorm.weight': 'tf_transformers/unilm/mlm_layer/layer_normalization/gamma:0',
    'cls.predictions.transform.LayerNorm.bias': 'tf_transformers/unilm/mlm_layer/layer_normalization/beta:0',
})

# Alternative (disabled) targets for the four `cls.predictions.transform.*`
# entries use the `extra_layer` scope instead of `mlm_layer`, e.g.
# 'tf_transformers/unilm/extra_layer/dense/kernel:0'.






In [38]:
# Substrings of tf_transformers variable names that decide how a checkpoint
# array has to be rearranged before it can be assigned.
_ATTENTION_KERNEL_TAGS = ('query/kernel:0', 'key/kernel:0', 'value/kernel:0')
_ATTENTION_BIAS_TAGS = ('query/bias:0', 'key/bias:0', 'value/bias:0')
# 'output/kernel:0' also matches '.../self_attention_output/kernel:0'.
# 'pooler_transform/kernel:0' was previously copied without a transpose; the
# matrix is square so no shape error was raised.
# NOTE(review): transposing it assumes the pooler is a Keras Dense layer with
# an (in, out) kernel like the other dense layers -- confirm.
_DENSE_KERNEL_TAGS = (
    'intermediate/kernel:0',
    'output/kernel:0',
    'mlm_layer/dense/kernel',
    'pooler_transform/kernel:0',
)


def _to_tf_layout(tf_name, torch_weight, target_shape):
    """Rearrange one checkpoint array into the layout of its TF variable.

    PyTorch ``nn.Linear`` stores its weight as (out_features, in_features),
    whereas the TF kernels here are laid out input-dimension first, so every
    linear weight must be transposed.

    Args:
        tf_name: Name of the destination tf_transformers variable.
        torch_weight: NumPy array taken from the PyTorch checkpoint.
        target_shape: Shape (list of ints) of the destination variable.

    Returns:
        A NumPy array ready to be assigned to the destination variable.
    """
    if any(tag in tf_name for tag in _ATTENTION_KERNEL_TAGS):
        # (out, in) -> (in, out) -> (in, heads, head_size)
        return torch_weight.transpose().reshape(target_shape)
    if any(tag in tf_name for tag in _ATTENTION_BIAS_TAGS):
        # (out,) -> (heads, head_size)
        return torch_weight.reshape(target_shape)
    if any(tag in tf_name for tag in _DENSE_KERNEL_TAGS):
        # (out, in) -> (in, out)
        return torch_weight.transpose()
    # Embeddings, layer norms and remaining biases share the same layout.
    return torch_weight


# Variable name -> position inside `unilm_layer.variables`.
tf_transformers_model_index_dict = {}
for index, var in enumerate(unilm_layer.variables):
    tf_transformers_model_index_dict[var.name] = index

# tf_transformers <-- PyTorch UniLM checkpoint
assigned_map = []          # (checkpoint name, tf name) for every copied weight
assigned_map_values = []   # (sum of source array, sum of assigned variable)
print(1000 * "*")
print("UNILM requires Transpose ")

# Fetch the variable list once instead of on every iteration.
model_variables = unilm_layer.variables
for original_var, legacy_var in mapping_dict.items():
    index = tf_transformers_model_index_dict[legacy_var]
    target_variable = model_variables[index]
    source_array = model_recover_float32[original_var]

    # Target shapes come from the variable itself, so the conversion is not
    # tied to one particular hidden size / head count.
    target_variable.assign(
        _to_tf_layout(legacy_var, source_array, target_variable.shape.as_list())
    )

    assigned_map.append((original_var, legacy_var))
    # Sums before/after are kept as a cheap checksum for manual inspection.
    assigned_map_values.append(
        (tf.reduce_sum(source_array).numpy(), tf.reduce_sum(target_variable).numpy())
    )

logging.info("Done assigning variables weights")


****************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

INFO:absl:Done assigning variables weights


In [39]:
# Token ids of one example; the last id (103) is the position whose token is
# predicted below.
# NOTE(review): 103 is presumably the [MASK] token of the vocabulary -- confirm.
input_ids = tf.constant([[  101, 17851,   117,  1699,   113, 13597,   114,  1109,  1497, 16810,
          2020,  1126,  4449,  1154,  1103,  5683,  1104,  1528,  7635,  1116,
          6945,  4573, 17600,  6744,  9031,  1115,  1119,  1108,  1136,  4484,
          1104,  1251,  1888,  8145,  1121,  1113,  2313,  1103,  4261,   119,
             2, 17851, 16810,   139, 10835,  5981,  1500, 13597,  1115,   107,
          1177,  1677,  1185,  6581,  1127,  1215,  1107,  1103,  5683,  4449,
           119,   107,     3,  1124,  1896,   117,   107,   138,  1825,  1150,
          1144,  1216,   170,  1888,  2993,  1106,  2411,  1660,  1122,  1106,
          1103, 17718,   119,   107,     4,  5981,   112,   188,  7640,  2812,
          3711,  1118,  1160,  6959,   117,  1528,  3828,   139,  2723,  1181,
          1105,  1497,  2123, 11492,   117,  1104,   170,  2765,  2179,  1888,
          4000,  1103,  5871,  8674,  1158,  1509,  3071,  1121,  1113,  2313,
          1528,  7635,  1116,  6945,  4573, 17600,  1112,  1122,  7573,  1154,
          1103,  1497, 14316,   119,  1398,  4214,  1113,  2313,  1127,  1841,
           119,     5,  2123, 11492,  1105,   139,  2723,  1181,  2103,  1115,
          1103,  1888,  1108,  6203,  1121,   170,  2179,  1120,  1103, 24069,
          1751,   119,     6,  1109,  1160,  5873,  1758,  1103,  3155,  1888,
           117,  1133,  1225,  1136,  2112,  1122,  1113,  1147, 12045,   119,
          1109,  5873,  1163,  1115,  1152,  2542,  1103,  1888,   117,   102,
           103]])

# Derive the per-token side inputs from the sequence length so they cannot
# drift out of sync with `input_ids`.
num_tokens = input_ids.shape[1]

# Segment id 4 for every token except the final one, which gets 5.
# NOTE(review): presumably 4 = source segment, 5 = target segment -- confirm.
input_type_ids = tf.constant([[4] * (num_tokens - 1) + [5]])

# Mask is 1 everywhere except the final position, which is 0.
input_mask = tf.constant([[1] * (num_tokens - 1) + [0]])


inputs_summarisation = {'input_ids':  input_ids,
          'input_mask': input_mask,
          'input_type_ids': input_type_ids}

outputs = unilm_layer(inputs_summarisation)

predicted_id = tf.argmax(outputs['last_token_logits'][0])
print("Predicted ids", predicted_id)

# Reference is 1497. Fail loudly here so that a broken weight conversion is
# never written to disk by the save step.
EXPECTED_TOKEN_ID = 1497
assert int(predicted_id.numpy()) == EXPECTED_TOKEN_ID, (
    "Converted model predicted id %d, expected %d"
    % (int(predicted_id.numpy()), EXPECTED_TOKEN_ID)
)

Predicted ids tf.Tensor(1497, shape=(), dtype=int64)


In [40]:
# Persist the converted encoder as a TensorFlow checkpoint, keeping only the
# most recent checkpoint in the target directory.

checkpoint_dir = '../model_directory/unilm_cnndm/'
ckpt = tf.train.Checkpoint(model=unilm_layer)
manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_dir,
    max_to_keep=1,
)
save_path = manager.save()

# Store the configuration used to build the encoder alongside the checkpoint.
with open("../model_directory/unilm_cnndm/unilm_config.json", "w") as config_out:
    config_out.write(json.dumps(config, indent=2))