In [1]:
!pip install transformers --quiet
!pip install git+https://github.com/abheesht17/keras-nlp.git@gpt-2-weights tensorflow tensorflow-text --upgrade --quiet

[K     |████████████████████████████████| 4.9 MB 19.0 MB/s 
[K     |████████████████████████████████| 120 kB 70.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 59.4 MB/s 
[K     |████████████████████████████████| 578.0 MB 17 kB/s 
[K     |████████████████████████████████| 5.9 MB 35.0 MB/s 
[K     |████████████████████████████████| 5.9 MB 62.0 MB/s 
[K     |████████████████████████████████| 1.7 MB 66.3 MB/s 
[K     |████████████████████████████████| 438 kB 50.4 MB/s 
[?25h  Building wheel for keras-nlp (setup.py) ... [?25l[?25hdone


In [2]:
# Let's fetch the BPE Tokenizer class from Jesse's branch.
!wget https://raw.githubusercontent.com/keras-team/keras-nlp/66c013cee666874e06f05da37191d70d08840a90/keras_nlp/tokenizers/byte_pair_tokenizer.py

--2022-09-17 17:12:18--  https://raw.githubusercontent.com/keras-team/keras-nlp/66c013cee666874e06f05da37191d70d08840a90/keras_nlp/tokenizers/byte_pair_tokenizer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14600 (14K) [text/plain]
Saving to: ‘byte_pair_tokenizer.py’


2022-09-17 17:12:18 (81.4 MB/s) - ‘byte_pair_tokenizer.py’ saved [14600/14600]



In [3]:
import sys
sys.path.append("./")

import json
import os

import keras_nlp
import numpy as np

import tensorflow as tf
from tensorflow import keras

import torch

from transformers import AutoTokenizer, AutoModel

from byte_pair_tokenizer import BytePairTokenizer

tf.__version__

'2.10.0'

In [4]:
# Variables.

# Should be one of 124M, 355M, 774M, 1558M.
NUM_PARAMS = "355M"

In [5]:
# Let's get the official OpenAI GPT-2 ckpt.

!wget https://raw.githubusercontent.com/openai/gpt-2/master/download_model.py
!python3 download_model.py $NUM_PARAMS

--2022-09-17 17:12:24--  https://raw.githubusercontent.com/openai/gpt-2/master/download_model.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1075 (1.0K) [text/plain]
Saving to: ‘download_model.py’


2022-09-17 17:12:24 (54.5 MB/s) - ‘download_model.py’ saved [1075/1075]

Fetching checkpoint: 1.00kit [00:00, 972kit/s]                                                      
Fetching encoder.json: 1.04Mit [00:01, 623kit/s]                                                    
Fetching hparams.json: 1.00kit [00:00, 747kit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 1.42Git [03:50, 6.16Mit/s]                                 
Fetching model.ckpt.index: 11.0kit [00:00, 9.13Mit/s]                                   

In [6]:
# GPT-2 paths.
extract_dir = f"/content/models/{NUM_PARAMS}"
merges_path = os.path.join(extract_dir, "vocab.bpe")
vocab_path = os.path.join(extract_dir, "encoder.json")
checkpoint_path = os.path.join(extract_dir, "model.ckpt")
config_path = os.path.join(extract_dir, "hparams.json")

In [7]:
# n_vocab, n_ctx, n_embd, n_head, n_layer
with open(config_path, "r") as f:
    cfg = json.load(f)

cfg

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 1024, 'n_head': 16, 'n_layer': 24}

In [8]:
vars = tf.train.list_variables(checkpoint_path)
weights = {}
for name, shape in vars:
    print(name, shape)
    weight = tf.train.load_variable(checkpoint_path, name)
    weights[name] = weight

model/h0/attn/c_attn/b [3072]
model/h0/attn/c_attn/w [1, 1024, 3072]
model/h0/attn/c_proj/b [1024]
model/h0/attn/c_proj/w [1, 1024, 1024]
model/h0/ln_1/b [1024]
model/h0/ln_1/g [1024]
model/h0/ln_2/b [1024]
model/h0/ln_2/g [1024]
model/h0/mlp/c_fc/b [4096]
model/h0/mlp/c_fc/w [1, 1024, 4096]
model/h0/mlp/c_proj/b [1024]
model/h0/mlp/c_proj/w [1, 4096, 1024]
model/h1/attn/c_attn/b [3072]
model/h1/attn/c_attn/w [1, 1024, 3072]
model/h1/attn/c_proj/b [1024]
model/h1/attn/c_proj/w [1, 1024, 1024]
model/h1/ln_1/b [1024]
model/h1/ln_1/g [1024]
model/h1/ln_2/b [1024]
model/h1/ln_2/g [1024]
model/h1/mlp/c_fc/b [4096]
model/h1/mlp/c_fc/w [1, 1024, 4096]
model/h1/mlp/c_proj/b [1024]
model/h1/mlp/c_proj/w [1, 4096, 1024]
model/h10/attn/c_attn/b [3072]
model/h10/attn/c_attn/w [1, 1024, 3072]
model/h10/attn/c_proj/b [1024]
model/h10/attn/c_proj/w [1, 1024, 1024]
model/h10/ln_1/b [1024]
model/h10/ln_1/g [1024]
model/h10/ln_2/b [1024]
model/h10/ln_2/g [1024]
model/h10/mlp/c_fc/b [4096]
model/h10/mlp/

In [9]:
model = keras_nlp.models.Gpt2Medium(vocabulary_size=cfg['n_vocab'])

In [10]:
model.summary()

Model: "gpt2_custom"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 token_embedding (Embedding)    (None, None, 1024)   51463168    ['token_ids[0][0]']              
                                                                                                  
 position_embedding (PositionEm  (None, None, 1024)  1048576     ['token_embedding[0][0]']        
 bedding)                                                                                         
                                                                                                  
 add (Add)                      (None, None, 1024)   0           ['token_embedding[0][0]

In [11]:
model.get_layer("token_embedding").embeddings.assign(
    weights["model/wte"]
)
model.get_layer("position_embedding").position_embeddings.assign(
    weights["model/wpe"]
)

range_1 = (0, cfg["n_embd"])
range_2 = (cfg["n_embd"], 2 * cfg["n_embd"]) 
range_3 = (2 * cfg["n_embd"], 3 * cfg["n_embd"])


for i in range(model.num_layers):
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.kernel.assign(
        weights[f"model/h{i}/attn/c_attn/w"][0, :, range_1[0]: range_1[1]].reshape((cfg["n_embd"], cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.bias.assign(
        weights[f"model/h{i}/attn/c_attn/b"][range_1[0]: range_1[1]].reshape((cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.kernel.assign(
        weights[f"model/h{i}/attn/c_attn/w"][0, :, range_2[0]: range_2[1]].reshape((cfg["n_embd"], cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.bias.assign(
        weights[f"model/h{i}/attn/c_attn/b"][range_2[0]: range_2[1]].reshape((cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.kernel.assign(
        weights[f"model/h{i}/attn/c_attn/w"][0, :, range_3[0]: range_3[1]].reshape((cfg["n_embd"], cfg["n_head"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.bias.assign(
        weights[f"model/h{i}/attn/c_attn/b"][range_3[0]: range_3[1]].reshape((cfg["n_head"], -1))
    )


    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.kernel.assign(
        weights[f"model/h{i}/attn/c_proj/w"][0].reshape((cfg["n_head"], -1, cfg["n_embd"]))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.bias.assign(
        weights[f"model/h{i}/attn/c_proj/b"]
    )


    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.gamma.assign(
        weights[f"model/h{i}/ln_1/g"]
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.beta.assign(
        weights[f"model/h{i}/ln_1/b"]
    )


    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.kernel.assign(
        weights[f"model/h{i}/mlp/c_fc/w"][0]
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.bias.assign(
        weights[f"model/h{i}/mlp/c_fc/b"]
    )


    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.kernel.assign(
        weights[f"model/h{i}/mlp/c_proj/w"][0]
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.bias.assign(
        weights[f"model/h{i}/mlp/c_proj/b"]
    )


    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.gamma.assign(
        weights[f"model/h{i}/ln_2/g"]
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.beta.assign(
        weights[f"model/h{i}/ln_2/b"]
    )


model.get_layer("layer_norm").gamma.assign(
    weights[f"model/ln_f/g"]
)

model.get_layer("layer_norm").beta.assign(
    weights[f"model/ln_f/b"]
)
pass

In [12]:
# Define KerasNLP components.

bpe_tokenizer = BytePairTokenizer(
    vocabulary=vocab_path,
    merges=merges_path,
    # sequence_length=cfg["n_ctx"]
)

# Define HF components.

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
hf_model = AutoModel.from_pretrained("gpt2-medium")
hf_model.eval()
pass

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

In [13]:
# KerasNLP
token_ids = bpe_tokenizer(["the quick brown fox ran, galloped and jumped."])
padding_mask = token_ids != 0

inputs = {
    "token_ids": token_ids.to_tensor(),
    "padding_mask": padding_mask.to_tensor()
}

keras_nlp_output = model.predict(inputs)

# HF
hf_inputs = hf_tokenizer(["the quick brown fox ran, galloped and jumped."], return_tensors="pt")
hf_output = hf_model(**hf_inputs).last_hidden_state



In [14]:
keras_nlp_output[0, 0, :10], hf_output[0, 0, :10]

(array([-0.18603998, -0.29847348, -0.36645514,  0.786773  , -0.15711385,
        -0.18149738, -0.00163372, -0.48359686, -0.1866399 , -1.6125616 ],
       dtype=float32),
 tensor([-0.1860, -0.2985, -0.3665,  0.7868, -0.1571, -0.1815, -0.0016, -0.4836,
         -0.1866, -1.6126], grad_fn=<SliceBackward0>))

In [15]:
np.mean(keras_nlp_output - hf_output.detach().numpy())

3.0315807e-08

In [16]:
# Save GPT2 checkpoint.
model.save_weights(f"""gpt2_medium.h5""")

In [17]:
model2 = keras_nlp.models.Gpt2Medium(vocabulary_size=cfg['n_vocab'])
model2.load_weights(f"""gpt2_medium.h5""")

In [18]:
token_ids = bpe_tokenizer(["the quick brown fox ran, galloped and jumped."])
padding_mask = token_ids != 0

inputs = {
    "token_ids": token_ids.to_tensor(),
    "padding_mask": padding_mask.to_tensor()
}

keras_nlp_output2 = model2.predict(inputs)



In [19]:
tf.reduce_mean(keras_nlp_output - keras_nlp_output2)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [None]:
!md5sum gpt2_medium.h5

580ff9b79c04fc90e6d6f47e975c5afe  gpt2_medium.h5


In [20]:
# Check uploaded model once added to repo
model_cloud = keras_nlp.models.Gpt2Medium(weights="gpt2_medium")

Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_medium_webtext/model.h5


In [21]:
# Same output from cloud model
keras_nlp_output_cloud = model_cloud.predict(inputs)
tf.reduce_mean(keras_nlp_output - keras_nlp_output_cloud)



<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [22]:
keras_nlp_output_cloud[0, 0, :10]

array([-0.18603998, -0.29847348, -0.36645514,  0.786773  , -0.15711385,
       -0.18149738, -0.00163372, -0.48359686, -0.1866399 , -1.6125616 ],
      dtype=float32)