In [1]:
!pip install transformers --quiet
!pip install git+https://github.com/abheesht17/keras-nlp.git@gpt2-presets tensorflow tensorflow-text --upgrade --quiet

[K     |████████████████████████████████| 5.5 MB 5.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 49.2 MB/s 
[K     |████████████████████████████████| 163 kB 65.2 MB/s 
[K     |████████████████████████████████| 578.0 MB 16 kB/s 
[K     |████████████████████████████████| 5.9 MB 50.3 MB/s 
[K     |████████████████████████████████| 1.7 MB 51.9 MB/s 
[K     |████████████████████████████████| 438 kB 73.1 MB/s 
[K     |████████████████████████████████| 5.9 MB 33.5 MB/s 
[?25h  Building wheel for keras-nlp (setup.py) ... [?25l[?25hdone


In [2]:
import sys
sys.path.append("./")

import json
import os

import keras_nlp
import numpy as np

import tensorflow as tf
from tensorflow import keras

import torch

from transformers import AutoTokenizer, AutoModel

tf.__version__

'2.10.0'

In [3]:
# Variables.

# Should be one of 124M, 355M, 774M, 1558M.
NUM_PARAMS = "124M"

In [4]:
# Let's get the official OpenAI GPT-2 ckpt.

!wget https://raw.githubusercontent.com/openai/gpt-2/master/download_model.py
!python3 download_model.py $NUM_PARAMS

--2022-11-13 08:50:21--  https://raw.githubusercontent.com/openai/gpt-2/master/download_model.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1075 (1.0K) [text/plain]
Saving to: ‘download_model.py’


2022-11-13 08:50:21 (51.6 MB/s) - ‘download_model.py’ saved [1075/1075]

Fetching checkpoint: 1.00kit [00:00, 823kit/s]                                                      
Fetching encoder.json: 1.04Mit [00:00, 3.10Mit/s]                                                   
Fetching hparams.json: 1.00kit [00:00, 647kit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 498Mit [00:19, 26.0Mit/s]                                  
Fetching model.ckpt.index: 6.00kit [00:00, 3.48Mit/s]                                   

In [5]:
# GPT-2 paths.
extract_dir = f"/content/models/{NUM_PARAMS}"
merges_path = os.path.join(extract_dir, "vocab.bpe")
vocab_path = os.path.join(extract_dir, "encoder.json")
checkpoint_path = os.path.join(extract_dir, "model.ckpt")
config_path = os.path.join(extract_dir, "hparams.json")

In [6]:
# n_vocab, n_ctx, n_embd, n_head, n_layer
with open(config_path, "r") as f:
    cfg = json.load(f)

cfg

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

In [7]:
vars = tf.train.list_variables(checkpoint_path)
weights = {}
for name, shape in vars:
    print(name, shape)
    weight = tf.train.load_variable(checkpoint_path, name)
    weights[name] = weight

model/h0/attn/c_attn/b [2304]
model/h0/attn/c_attn/w [1, 768, 2304]
model/h0/attn/c_proj/b [768]
model/h0/attn/c_proj/w [1, 768, 768]
model/h0/ln_1/b [768]
model/h0/ln_1/g [768]
model/h0/ln_2/b [768]
model/h0/ln_2/g [768]
model/h0/mlp/c_fc/b [3072]
model/h0/mlp/c_fc/w [1, 768, 3072]
model/h0/mlp/c_proj/b [768]
model/h0/mlp/c_proj/w [1, 3072, 768]
model/h1/attn/c_attn/b [2304]
model/h1/attn/c_attn/w [1, 768, 2304]
model/h1/attn/c_proj/b [768]
model/h1/attn/c_proj/w [1, 768, 768]
model/h1/ln_1/b [768]
model/h1/ln_1/g [768]
model/h1/ln_2/b [768]
model/h1/ln_2/g [768]
model/h1/mlp/c_fc/b [3072]
model/h1/mlp/c_fc/w [1, 768, 3072]
model/h1/mlp/c_proj/b [768]
model/h1/mlp/c_proj/w [1, 3072, 768]
model/h10/attn/c_attn/b [2304]
model/h10/attn/c_attn/w [1, 768, 2304]
model/h10/attn/c_proj/b [768]
model/h10/attn/c_proj/w [1, 768, 768]
model/h10/ln_1/b [768]
model/h10/ln_1/g [768]
model/h10/ln_2/b [768]
model/h10/ln_2/g [768]
model/h10/mlp/c_fc/b [3072]
model/h10/mlp/c_fc/w [1, 768, 3072]
model/h1

In [8]:
model = keras_nlp.models.GPT2.from_preset(
    "gpt2_base",
    load_weights=False,
)

In [9]:
model.summary()

Model: "backbone"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 token_embedding (Embedding)    (None, None, 768)    38597376    ['token_ids[0][0]']              
                                                                                                  
 position_embedding (PositionEm  (None, None, 768)   786432      ['token_embedding[0][0]']        
 bedding)                                                                                         
                                                                                                  
 add (Add)                      (None, None, 768)    0           ['token_embedding[0][0]', 

In [10]:
model.get_layer("token_embedding").embeddings.assign(
    weights["model/wte"]
)
model.get_layer("position_embedding").position_embeddings.assign(
    weights["model/wpe"]
)

range_1 = (0, cfg["n_embd"])
range_2 = (cfg["n_embd"], 2 * cfg["n_embd"]) 
range_3 = (2 * cfg["n_embd"], 3 * cfg["n_embd"])


for i in range(model.num_layers):
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.kernel.assign(
        weights[f"model/h{i}/attn/c_attn/w"][0, :, range_1[0]: range_1[1]].reshape((cfg["n_embd"], cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.bias.assign(
        weights[f"model/h{i}/attn/c_attn/b"][range_1[0]: range_1[1]].reshape((cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.kernel.assign(
        weights[f"model/h{i}/attn/c_attn/w"][0, :, range_2[0]: range_2[1]].reshape((cfg["n_embd"], cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.bias.assign(
        weights[f"model/h{i}/attn/c_attn/b"][range_2[0]: range_2[1]].reshape((cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.kernel.assign(
        weights[f"model/h{i}/attn/c_attn/w"][0, :, range_3[0]: range_3[1]].reshape((cfg["n_embd"], cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.bias.assign(
        weights[f"model/h{i}/attn/c_attn/b"][range_3[0]: range_3[1]].reshape((cfg["n_head"], -1))
    )


    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.kernel.assign(
        weights[f"model/h{i}/attn/c_proj/w"][0].reshape((cfg["n_head"], -1, cfg["n_embd"]))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.bias.assign(
        weights[f"model/h{i}/attn/c_proj/b"]
    )


    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.gamma.assign(
        weights[f"model/h{i}/ln_1/g"]
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.beta.assign(
        weights[f"model/h{i}/ln_1/b"]
    )


    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.kernel.assign(
        weights[f"model/h{i}/mlp/c_fc/w"][0]
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.bias.assign(
        weights[f"model/h{i}/mlp/c_fc/b"]
    )


    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.kernel.assign(
        weights[f"model/h{i}/mlp/c_proj/w"][0]
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.bias.assign(
        weights[f"model/h{i}/mlp/c_proj/b"]
    )


    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.gamma.assign(
        weights[f"model/h{i}/ln_2/g"]
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.beta.assign(
        weights[f"model/h{i}/ln_2/b"]
    )


model.get_layer("layer_norm").gamma.assign(
    weights[f"model/ln_f/g"]
)

model.get_layer("layer_norm").beta.assign(
    weights[f"model/ln_f/b"]
)
pass

In [11]:
# Define KerasNLP components.

bpe_tokenizer = keras_nlp.tokenizers.BytePairTokenizer(
    vocabulary=vocab_path,
    merges=merges_path,
    # sequence_length=cfg["n_ctx"]
)

# Define HF components.

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
hf_model = AutoModel.from_pretrained("gpt2")
hf_model.eval()
pass

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [12]:
# KerasNLP
token_ids = bpe_tokenizer(["the quick brown fox ran, galloped and jumped."])
padding_mask = token_ids != 0

inputs = {
    "token_ids": token_ids.to_tensor(),
    "padding_mask": padding_mask.to_tensor()
}

keras_nlp_output = model.predict(inputs)

# HF
hf_inputs = hf_tokenizer(["the quick brown fox ran, galloped and jumped."], return_tensors="pt")
hf_output = hf_model(**hf_inputs).last_hidden_state



In [13]:
keras_nlp_output[0, 0, :10], hf_output[0, 0, :10]

(array([-0.11164568, -0.03757825, -0.2624619 ,  0.00891277, -0.00618765,
        -0.20491552, -0.72548056, -0.0761577 , -0.03654918,  0.01662023],
       dtype=float32),
 tensor([-0.1116, -0.0376, -0.2625,  0.0089, -0.0062, -0.2049, -0.7255, -0.0762,
         -0.0365,  0.0166], grad_fn=<SliceBackward0>))

In [14]:
np.mean(keras_nlp_output - hf_output.detach().numpy())

1.665259e-08

In [15]:
# Save GPT-2 checkpoint.
model.save_weights(f"""gpt2_base.h5""")

In [16]:
model2 = keras_nlp.models.GPT2.from_preset(
    "gpt2_base",
    load_weights=False,
)
model2.load_weights(f"""gpt2_base.h5""")

In [17]:
token_ids = bpe_tokenizer(["the quick brown fox ran, galloped and jumped."])
padding_mask = token_ids != 0

inputs = {
    "token_ids": token_ids.to_tensor(),
    "padding_mask": padding_mask.to_tensor()
}

keras_nlp_output2 = model2.predict(inputs)



In [18]:
tf.reduce_mean(keras_nlp_output - keras_nlp_output2)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [19]:
# Obtain `merges.txt` and `vocab.json` (for the tokenizer).
# Note: These two files are the same for other variants of
# GPT-2. So, we will not copy these over separately for those
# variants.

!cp $merges_path ./merges.txt
!cp $vocab_path ./vocab.json

In [None]:
!md5sum merges.txt
!md5sum vocab.json

75a37753dd7a28a2c5df80c28bf06e4e  merges.txt
dffec25a898b1f5e569bec4dffd7e5c0  vocab.json


In [None]:
!md5sum gpt2_base.h5

f4ea6e1b214516dd7de452461ee6e16e  gpt2_base.h5


In [20]:
# Check uploaded model once added to repo
model_cloud = keras_nlp.models.GPT2.from_preset("gpt2_base", load_weights=True)

Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base/model.h5


In [21]:
# Same output from cloud model
keras_nlp_output_cloud = model_cloud.predict(inputs)
tf.reduce_mean(keras_nlp_output - keras_nlp_output_cloud)



<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [22]:
keras_nlp_output_cloud[0, 0, :10]

array([-0.11164568, -0.03757825, -0.2624619 ,  0.00891277, -0.00618765,
       -0.20491552, -0.72548056, -0.0761577 , -0.03654918,  0.01662023],
      dtype=float32)