In [None]:
!pip install transformers --quiet
!pip install git+https://github.com/abheesht17/keras-nlp.git@gpt2-presets tensorflow tensorflow-text --upgrade --quiet

[K     |████████████████████████████████| 4.9 MB 5.0 MB/s 
[K     |████████████████████████████████| 120 kB 78.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 81.0 MB/s 
[K     |████████████████████████████████| 578.0 MB 15 kB/s 
[K     |████████████████████████████████| 5.9 MB 75.1 MB/s 
[K     |████████████████████████████████| 1.7 MB 57.1 MB/s 
[K     |████████████████████████████████| 438 kB 81.8 MB/s 
[K     |████████████████████████████████| 5.9 MB 78.4 MB/s 
[?25h  Building wheel for keras-nlp (setup.py) ... [?25l[?25hdone


In [None]:
import json
import os

import keras_nlp
import numpy as np

import tensorflow as tf
from tensorflow import keras

import torch

from transformers import AutoTokenizer, AutoModel

tf.__version__

'2.10.0'

In [None]:
# Variables.

# Size tag of the OpenAI GPT-2 checkpoint to download and convert.
# Should be one of 124M, 355M, 774M, 1558M.
# NOTE(review): the KerasNLP preset name ("gpt2_extra_large") and the Hugging
# Face model id ("gpt2-xl") used in later cells are hard-coded, so they have to
# be changed by hand when a different size is selected here.
NUM_PARAMS = "1558M"

In [None]:
# Let's get the official OpenAI GPT-2 ckpt.

!wget https://raw.githubusercontent.com/openai/gpt-2/master/download_model.py
!python3 download_model.py $NUM_PARAMS

--2022-09-17 17:37:59--  https://raw.githubusercontent.com/openai/gpt-2/master/download_model.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1075 (1.0K) [text/plain]
Saving to: ‘download_model.py’


2022-09-17 17:37:59 (54.5 MB/s) - ‘download_model.py’ saved [1075/1075]

Fetching checkpoint: 1.00kit [00:00, 881kit/s]                                                      
Fetching encoder.json: 1.04Mit [00:00, 4.32Mit/s]                                                   
Fetching hparams.json: 1.00kit [00:00, 1.07Mit/s]                                                   
Fetching model.ckpt.data-00000-of-00001: 6.23Git [02:28, 41.9Mit/s]                                 
Fetching model.ckpt.index: 21.0kit [00:00, 822kit/s]                                    

In [None]:
# GPT-2 paths.
# The download script is run from the notebook's working directory and stores
# the files under `models/<size>` there, so resolve the directory against the
# working directory instead of hard-coding Colab's `/content` prefix (on Colab
# this still evaluates to `/content/models/<size>`).
extract_dir = os.path.abspath(os.path.join("models", NUM_PARAMS))
merges_path = os.path.join(extract_dir, "vocab.bpe")  # BPE merge rules
vocab_path = os.path.join(extract_dir, "encoder.json")  # token -> id mapping
checkpoint_path = os.path.join(extract_dir, "model.ckpt")  # TF checkpoint prefix
config_path = os.path.join(extract_dir, "hparams.json")  # model hyperparameters

In [None]:
# Read the checkpoint's hyperparameters into `cfg`.
# Keys: n_vocab, n_ctx, n_embd, n_head, n_layer
with open(config_path, "r") as config_file:
    cfg = json.loads(config_file.read())

cfg

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 1600, 'n_head': 25, 'n_layer': 48}

In [None]:
# Load every tensor stored in the TF checkpoint into `weights`, a dict mapping
# the checkpoint variable name to its NumPy array.
#
# One checkpoint reader is opened and reused for all variables;
# `tf.train.load_variable` would open a new reader on every call.
# The variable listing is kept in `ckpt_variables` rather than `vars`, so the
# builtin `vars()` is not shadowed for the rest of the notebook.
ckpt_reader = tf.train.load_checkpoint(checkpoint_path)
ckpt_variables = tf.train.list_variables(checkpoint_path)

weights = {}
for name, shape in ckpt_variables:
    print(name, shape)
    weights[name] = ckpt_reader.get_tensor(name)

model/h0/attn/c_attn/b [4800]
model/h0/attn/c_attn/w [1, 1600, 4800]
model/h0/attn/c_proj/b [1600]
model/h0/attn/c_proj/w [1, 1600, 1600]
model/h0/ln_1/b [1600]
model/h0/ln_1/g [1600]
model/h0/ln_2/b [1600]
model/h0/ln_2/g [1600]
model/h0/mlp/c_fc/b [6400]
model/h0/mlp/c_fc/w [1, 1600, 6400]
model/h0/mlp/c_proj/b [1600]
model/h0/mlp/c_proj/w [1, 6400, 1600]
model/h1/attn/c_attn/b [4800]
model/h1/attn/c_attn/w [1, 1600, 4800]
model/h1/attn/c_proj/b [1600]
model/h1/attn/c_proj/w [1, 1600, 1600]
model/h1/ln_1/b [1600]
model/h1/ln_1/g [1600]
model/h1/ln_2/b [1600]
model/h1/ln_2/g [1600]
model/h1/mlp/c_fc/b [6400]
model/h1/mlp/c_fc/w [1, 1600, 6400]
model/h1/mlp/c_proj/b [1600]
model/h1/mlp/c_proj/w [1, 6400, 1600]
model/h10/attn/c_attn/b [4800]
model/h10/attn/c_attn/w [1, 1600, 4800]
model/h10/attn/c_proj/b [1600]
model/h10/attn/c_proj/w [1, 1600, 1600]
model/h10/ln_1/b [1600]
model/h10/ln_1/g [1600]
model/h10/ln_2/b [1600]
model/h10/ln_2/g [1600]
model/h10/mlp/c_fc/b [6400]
model/h10/mlp/

In [None]:
# Build the KerasNLP GPT-2 architecture for the extra-large preset without
# loading the preset's published weights; the checkpoint tensors are copied in
# by hand in a later cell.
model = keras_nlp.models.GPT2.from_preset("gpt2_extra_large", load_weights=False)

In [None]:
# Print the layer table (names, output shapes, parameter counts) of the model.
model.summary()

Model: "gpt2_custom"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 token_embedding (Embedding)    (None, None, 1600)   80411200    ['token_ids[0][0]']              
                                                                                                  
 position_embedding (PositionEm  (None, None, 1600)  1638400     ['token_embedding[0][0]']        
 bedding)                                                                                         
                                                                                                  
 add (Add)                      (None, None, 1600)   0           ['token_embedding[0][0]

In [None]:
# Copy the OpenAI checkpoint tensors into the matching KerasNLP variables.

hidden_dim = cfg["n_embd"]
num_heads = cfg["n_head"]

# Token and position embedding tables.
model.get_layer("token_embedding").embeddings.assign(weights["model/wte"])
model.get_layer("position_embedding").position_embeddings.assign(
    weights["model/wpe"]
)

# Final layer norm (`ln_f` in the checkpoint).
final_norm = model.get_layer("layer_norm")
final_norm.gamma.assign(weights["model/ln_f/g"])
final_norm.beta.assign(weights["model/ln_f/b"])

# Per-block weights. The loop is deliberately the last statement so the cell
# produces no output.
for layer_idx in range(model.num_layers):
    block = model.get_layer(f"transformer_layer_{layer_idx}")
    attention = block._self_attention_layer
    prefix = f"model/h{layer_idx}"

    # `c_attn` holds query, key and value side by side on its last axis, each
    # `n_embd` wide. Every slice is reshaped to (n_embd, n_head, head_dim) for
    # the kernel and (n_head, head_dim) for the bias.
    fused_kernel = weights[f"{prefix}/attn/c_attn/w"][0]
    fused_bias = weights[f"{prefix}/attn/c_attn/b"]
    projections = (
        attention._query_dense,
        attention._key_dense,
        attention._value_dense,
    )
    for slot, projection in enumerate(projections):
        start, stop = slot * hidden_dim, (slot + 1) * hidden_dim
        projection.kernel.assign(
            fused_kernel[:, start:stop].reshape((hidden_dim, num_heads, -1))
        )
        projection.bias.assign(fused_bias[start:stop].reshape((num_heads, -1)))

    # Attention output projection, reshaped to (n_head, head_dim, n_embd).
    attention._output_dense.kernel.assign(
        weights[f"{prefix}/attn/c_proj/w"][0].reshape((num_heads, -1, hidden_dim))
    )
    attention._output_dense.bias.assign(weights[f"{prefix}/attn/c_proj/b"])

    # Layer norm belonging to the attention sub-layer (`ln_1`).
    block._self_attention_layernorm.gamma.assign(weights[f"{prefix}/ln_1/g"])
    block._self_attention_layernorm.beta.assign(weights[f"{prefix}/ln_1/b"])

    # Feedforward network: `c_fc` is the intermediate dense, `c_proj` the output.
    block._feedforward_intermediate_dense.kernel.assign(
        weights[f"{prefix}/mlp/c_fc/w"][0]
    )
    block._feedforward_intermediate_dense.bias.assign(
        weights[f"{prefix}/mlp/c_fc/b"]
    )
    block._feedforward_output_dense.kernel.assign(
        weights[f"{prefix}/mlp/c_proj/w"][0]
    )
    block._feedforward_output_dense.bias.assign(weights[f"{prefix}/mlp/c_proj/b"])

    # Layer norm belonging to the feedforward sub-layer (`ln_2`).
    block._feedforward_layernorm.gamma.assign(weights[f"{prefix}/ln_2/g"])
    block._feedforward_layernorm.beta.assign(weights[f"{prefix}/ln_2/b"])

In [None]:
# KerasNLP side: tokenizer built from the downloaded OpenAI vocabulary and
# merges files. No `sequence_length` is passed to it.
gpt2_tokenizer = keras_nlp.models.GPT2Tokenizer(
    vocabulary=vocab_path, merges=merges_path
)

# Hugging Face side: reference tokenizer and model. `.eval()` returns the
# module itself, so chaining it keeps the cell free of output.
hf_tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
hf_model = AutoModel.from_pretrained("gpt2-xl").eval()

Downloading:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

In [None]:
# Run the same sentence through the converted KerasNLP model and the Hugging
# Face reference model.
sample_text = ["the quick brown fox ran, galloped and jumped."]

# KerasNLP
token_ids = gpt2_tokenizer(sample_text)
# The mask is derived from the ragged row lengths: True for every real token,
# False for positions that `to_tensor()` pads. Comparing the ids against 0 is
# not a valid padding test for GPT-2, because id 0 is a regular vocabulary
# entry and would be masked out whenever it occurs in the text.
padding_mask = tf.sequence_mask(token_ids.row_lengths())

inputs = {
    "token_ids": token_ids.to_tensor(),
    "padding_mask": padding_mask,
}

keras_nlp_output = model.predict(inputs)

# HF
hf_inputs = hf_tokenizer(sample_text, return_tensors="pt")
# Inference only: without `no_grad` the output carries an autograd graph that
# keeps the model parameters referenced even after `hf_model` is deleted.
with torch.no_grad():
    hf_output = hf_model(**hf_inputs).last_hidden_state



In [None]:
# Eyeball check: the first ten values at index [0, 0] of each model's output,
# shown side by side (KerasNLP first, Hugging Face second).
keras_nlp_output[0, 0, :10], hf_output[0, 0, :10]

(array([-0.26750785,  0.22396186, -0.05913803, -0.12199795, -0.30013862,
        -0.12183367, -0.5269151 ,  0.68675554,  1.8155214 , -0.2948038 ],
       dtype=float32),
 tensor([-0.2675,  0.2240, -0.0591, -0.1220, -0.3001, -0.1218, -0.5269,  0.6868,
          1.8155, -0.2948], grad_fn=<SliceBackward0>))

In [None]:
# Numerical agreement between the converted model and the HF reference.
# A signed mean lets positive and negative errors cancel each other out, so the
# largest absolute deviation is reported as well; the mean stays the cell value.
hf_diff = keras_nlp_output - hf_output.detach().numpy()
print("max abs diff:", np.max(np.abs(hf_diff)))
np.mean(hf_diff)

-1.2139675e-08

In [None]:
# Write the converted GPT-2 weights to an `.h5` file in the working directory.
model.save_weights("gpt2_extra_large.h5")

In [None]:
# Build a second, independent copy of the architecture and restore the weights
# file written by the previous cell, to check that the saved file round-trips.
model2 = keras_nlp.models.GPT2.from_preset("gpt2_extra_large", load_weights=False)
model2.load_weights("gpt2_extra_large.h5")

In [None]:
# Run the sample sentence through the model restored from the saved file.
token_ids = gpt2_tokenizer(["the quick brown fox ran, galloped and jumped."])
# The mask is derived from the ragged row lengths: True for every real token,
# False for positions that `to_tensor()` pads. Comparing the ids against 0 is
# not a valid padding test for GPT-2, because id 0 is a regular vocabulary
# entry and would be masked out whenever it occurs in the text.
padding_mask = tf.sequence_mask(token_ids.row_lengths())

inputs = {
    "token_ids": token_ids.to_tensor(),
    "padding_mask": padding_mask,
}

keras_nlp_output2 = model2.predict(inputs)



In [None]:
# Save/load round-trip check against the in-memory converted model.
# A signed mean can be zero while individual entries differ, so the largest
# absolute deviation is reported as well; the mean stays the cell value.
roundtrip_diff = keras_nlp_output - keras_nlp_output2
print("max abs diff:", np.max(np.abs(roundtrip_diff)))
tf.reduce_mean(roundtrip_diff)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [None]:
# MD5 checksum of the saved weights file.
# NOTE(review): presumably recorded so the uploaded copy can be verified — confirm.
!md5sum gpt2_extra_large.h5

d093c1ee0d9705d845c0190909aa2917  gpt2_extra_large.h5


In [None]:
# Free up some RAM :)
# Drop the notebook's references to all three in-memory models before the
# published weights are loaded.

del model, model2, hf_model

In [None]:
# Once the converted file has been added to the repo, load the preset together
# with its published weights to check the uploaded copy.
model_cloud = keras_nlp.models.GPT2.from_preset("gpt2_extra_large", load_weights=True)

Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_extra_large/model.h5


In [None]:
# Same output from cloud model
# A signed mean can be zero while individual entries differ, so the largest
# absolute deviation is reported as well; the mean stays the cell value.
keras_nlp_output_cloud = model_cloud.predict(inputs)
cloud_diff = keras_nlp_output - keras_nlp_output_cloud
print("max abs diff:", np.max(np.abs(cloud_diff)))
tf.reduce_mean(cloud_diff)



<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [None]:
# First ten values at index [0, 0] of the cloud model's output, for visual
# comparison with the values displayed for the locally converted model.
keras_nlp_output_cloud[0, 0, :10]

array([-0.26750785,  0.22396186, -0.05913803, -0.12199795, -0.30013862,
       -0.12183367, -0.5269151 ,  0.68675554,  1.8155214 , -0.2948038 ],
      dtype=float32)