In [6]:
import json
import tensorflow as tf
from tf_transformers.models import GPT2Encoder

from absl import logging
# Raise absl's log threshold to INFO so the INFO-level messages emitted while
# building and restoring the model are shown in the notebook.
logging.set_verbosity(logging.INFO)

In [7]:



# Read the GPT-2 hyperparameters. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open until garbage
# collection.
with open("../model_directory/gpt2_base/gpt2_config.json") as config_file:
    config = json.load(config_file)

# Build the GPT-2 encoder from the loaded config; the weights are filled in
# later from the TF1 checkpoint.
model_layer = GPT2Encoder(
    config=config,
    name='gpt2',
    mask_mode='causal',
    is_training=False,
)


INFO:absl:We are overwriding `is_training` is False to `is_training` to True with `use_dropout` is False, no effects on your inference pipeline
INFO:absl:Inputs -->
INFO:absl:input_ids ---> Tensor("input_ids:0", shape=(None, None), dtype=int32)
INFO:absl:Initialized Variables


In [8]:
# Show the hyperparameter dict that was loaded from gpt2_config.json.
config

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'intermediate_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'embedding_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 1024,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 50257,
 'layer_norm_epsilon': 1e-05}

In [14]:
# Report the TensorFlow version this notebook was run with.
tf.__version__

'2.3.0-rc0'

In [9]:
# Load the checkpoint from the GPT-2 TF1 model so its weights can be assigned
# to the TF2 model and saved.

# Single source of truth for the TF1 checkpoint location (previously repeated
# in both calls below).
# NOTE(review): this is a machine-specific absolute path; point it at your own
# download of the GPT-2 117M checkpoint before running.
GPT2_TF1_CHECKPOINT_DIR = "/Users/PRVATE/gpt-2-master/models/117M/"

# `ckpt` is the reader whose get_tensor(name) returns a stored value;
# `gpt2_vars` lists the stored variables as (name, shape) entries.
ckpt = tf.train.load_checkpoint(GPT2_TF1_CHECKPOINT_DIR)
gpt2_vars = tf.train.list_variables(GPT2_TF1_CHECKPOINT_DIR)

# Translation table: TF1 GPT-2 checkpoint name fragment -> TF2 variable name.
#
# The first twelve values are relative to a transformer layer; tf1_to_tf2
# prepends 'tf_transformers/gpt2/transformer/layer_<n>/' to them. The last
# four values are already complete variable names and are used as-is.
mapping_dict = dict([
    # -- self-attention ------------------------------------------------------
    ("attn/c_attn/b", "self_attention/qkv/bias"),
    ("attn/c_attn/w", "self_attention/qkv/kernel"),
    ("attn/c_proj/b", "self_attention_output/bias"),
    ("attn/c_proj/w", "self_attention_output/kernel"),
    # -- layer norms inside a layer ------------------------------------------
    ("ln_1/b", "ln_1/layer_norm/beta"),
    ("ln_1/g", "ln_1/layer_norm/gamma"),
    ("ln_2/b", "self_attention_layer_norm/beta"),
    ("ln_2/g", "self_attention_layer_norm/gamma"),
    # -- feed-forward (MLP) --------------------------------------------------
    ("mlp/c_fc/b", "intermediate/bias"),
    ("mlp/c_fc/w", "intermediate/kernel"),
    ("mlp/c_proj/b", "output/bias"),
    ("mlp/c_proj/w", "output/kernel"),
    # -- variables outside the layers (full target names) --------------------
    ("model/wpe", "tf_transformers/gpt2/positional_embeddings/embeddings"),
    ("model/wte", "tf_transformers/gpt2/word_embeddings/embeddings"),
    ("ln_f/b", "tf_transformers/gpt2/ln_f/layer_norm/beta"),
    ("ln_f/g", "tf_transformers/gpt2/ln_f/layer_norm/gamma"),
])

def _drop_leading_unit_axis(w):
    """Return `w` without its first axis when that axis has length 1."""
    shape = tuple(w.shape)
    # Rank-0 values have an empty shape and are passed through untouched.
    if shape and shape[0] == 1:
        return w[0]
    return w


def tf1_to_tf2(model, checkpoint_reader=None, checkpoint_vars=None, name_map=None):
    """Copy GPT-2 TF1 checkpoint weights into the variables of `model`.

    Every checkpoint variable is translated to a target variable name through
    `name_map` and its value is assigned to the matching model variable.
    Names whose second path component starts with 'h' (e.g. 'model/h3/...')
    are treated as belonging to transformer layer 3 and are targeted at
    'tf_transformers/gpt2/transformer/layer_3/<mapped name>'. All other names
    are matched against the four non-layer keys and use the mapped name as-is.

    Args:
        model: object exposing `.variables`, an iterable of variables that
            have a `.name` ('<name>:<n>') and an `.assign(value)` method.
        checkpoint_reader: object with `get_tensor(name)`. Defaults to the
            notebook-level `ckpt`.
        checkpoint_vars: iterable of (name, shape) entries describing the
            checkpoint contents. Defaults to the notebook-level `gpt2_vars`.
        name_map: dict from TF1 name fragment to TF2 name. Defaults to the
            notebook-level `mapping_dict`.

    Returns:
        None. The model variables are updated in place.

    Raises:
        KeyError: if a translated name does not exist among `model.variables`.
    """
    reader = ckpt if checkpoint_reader is None else checkpoint_reader
    source_vars = gpt2_vars if checkpoint_vars is None else checkpoint_vars
    names = mapping_dict if name_map is None else name_map

    # Evaluate `model.variables` once and index the variables by the name
    # without its ':<n>' suffix, instead of re-reading it for every assignment.
    targets = {var.name.split(':')[0]: var for var in model.variables}

    # Keys that live outside the per-layer scopes.
    global_keys = ('model/wpe', 'model/wte', 'ln_f/b', 'ln_f/g')

    for entry in source_vars:
        var_name = entry[0]
        parts = var_name.split('/')
        if len(parts) < 2:
            # Un-scoped entries (for example a step counter) have nothing to
            # map to; previously these raised IndexError.
            continue

        scope = parts[1]
        if scope.startswith('h'):
            layer_name = 'layer_{}'.format(scope[1:])
            prefix = 'tf_transformers/gpt2/{}/{}/'.format('transformer', layer_name)
            candidates = names
        else:
            prefix = ''
            candidates = global_keys

        for key in candidates:
            if key not in var_name:
                continue
            # In GPT2, many variables are stored with an extra leading axis of
            # length 1; drop it when present.
            w = _drop_leading_unit_axis(reader.get_tensor(var_name))
            target_name = prefix + names[key]
            try:
                target = targets[target_name]
            except KeyError:
                raise KeyError(
                    "No model variable named {!r} for checkpoint variable {!r}".format(
                        target_name, var_name
                    )
                ) from None
            target.assign(w)
    logging.info("GPT2 weights restored")
    
# Copy the TF1 checkpoint weights into the encoder built earlier.
tf1_to_tf2(model_layer)

INFO:absl:GPT2 weights restored


In [16]:
import tensorflow as tf

# A 1 x 11 array of token ids used as a smoke-test input for the restored model.
input_ids = tf.constant(
    [[50, 620, 259, 48664, 12171, 283, 318, 530, 286, 262, 18822]]
)

# The model is called with a dict keyed by input name.
results = model_layer(dict(input_ids=input_ids))

# Reference value of results['token_logits'] recorded from an earlier run;
# kept here so the output displayed below can be compared against it by eye.
#
# <tf.Tensor: shape=(1, 11, 50257), dtype=float32, numpy=
# array([[[ -32.175262,  -31.924515,  -34.43251 , ...,  -40.16725 ,
#           -39.505302,  -32.213375],
#         [ -64.75787 ,  -64.33038 ,  -67.42167 , ...,  -75.922905,
#           -72.15867 ,  -64.45026 ],
#         [ -67.90772 ,  -65.774635,  -71.97972 , ...,  -74.40051 ,
#           -72.69093 ,  -65.50628 ],
#         ...,
#         [ -77.36691 ,  -75.05343 ,  -80.40854 , ...,  -83.03256 ,
#           -83.34796 ,  -78.000595],
#         [-114.61692 , -111.259094, -117.29202 , ..., -114.363396,
#          -117.813736, -112.561356],
#         [-118.39931 , -118.12673 , -122.41326 , ..., -122.978035,
#          -125.60019 , -119.76474 ]]], dtype=float32)>

# Last expression of the cell, so the notebook displays the logits.
results['token_logits']

<tf.Tensor: shape=(1, 11, 50257), dtype=float32, numpy=
array([[[ -32.175262,  -31.924515,  -34.43251 , ...,  -40.16725 ,
          -39.505302,  -32.213375],
        [ -64.75787 ,  -64.33038 ,  -67.42167 , ...,  -75.922905,
          -72.15867 ,  -64.45026 ],
        [ -67.90772 ,  -65.774635,  -71.97972 , ...,  -74.40051 ,
          -72.69093 ,  -65.50628 ],
        ...,
        [ -77.36691 ,  -75.05343 ,  -80.40854 , ...,  -83.03256 ,
          -83.34796 ,  -78.000595],
        [-114.61692 , -111.259094, -117.29202 , ..., -114.363396,
         -117.813736, -112.561356],
        [-118.39931 , -118.12673 , -122.41326 , ..., -122.978035,
         -125.60019 , -119.76474 ]]], dtype=float32)>

In [17]:
# Save the model
#
# The tf.train.Checkpoint object gets its own name so that it does not rebind
# `ckpt`, which holds the TF1 checkpoint reader that tf1_to_tf2 reads from by
# default; reusing the name would break any later call to tf1_to_tf2.
checkpoint_dir = '../model_directory/gpt2_base'
checkpoint = tf.train.Checkpoint(model=model_layer)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=1)
save_path = manager.save()

# Write the config next to the checkpoint. The path is derived from
# checkpoint_dir so the two locations cannot drift apart.
config_path = '{}/gpt2_config.json'.format(checkpoint_dir)
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

In [18]:
# Show the path returned by manager.save().
save_path

'../model_directory/gpt2_base/ckpt-1'