In [1]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.2 MB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m20.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [2]:
import tiktoken
import tensorflow as tf
import numpy as np
import re
from tensorflow.keras  import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, LayerNormalization, Dense, Add, Concatenate, Dropout
from keras.saving import register_keras_serializable
from keras import config

#from google.colab import drive
#drive.mount('/content/drive')

token2vec = tiktoken.encoding_for_model("gpt-3.5-turbo")

Mounted at /content/drive


In [3]:
#path = "/content/drive/MyDrive/shakespeare.txt"
def load_shakespeare():
  tokens = None
  with open(path, "r") as file:
    corpus = ""
    for i,line in enumerate(file):
        corpus+=line
    file.close()
    return token2vec.encode(corpus)

In [4]:
@register_keras_serializable()
class embedify(tf.keras.layers.Layer):
  # __NOTES__
  # The build() function is used to create weights that depend on the input shape, we M's dimensions set in constructor so it's all good
  # The get_config() function is used to return a dict of what input params are needed to load this model from a save
  def __init__(self, emb_dim, vocab_size, context_size, **kwargs):
    super().__init__(**kwargs)
    self.emb_dim = emb_dim
    self.vocab_size = vocab_size
    self.context_size = context_size
    self.M = self.add_weight(shape=(vocab_size, emb_dim), initializer='glorot_uniform', name='M', trainable=True)
    position = np.arange(context_size)[:, np.newaxis]
    div_term = np.exp(np.arange(0, emb_dim, 2) * -(np.log(10000.0) / emb_dim))
    pe = np.zeros((context_size, emb_dim))
    pe[:, 0::2] = np.sin(position * div_term)
    pe[:, 1::2] = np.cos(position * div_term)
    self.positional_encoding = tf.constant(pe[np.newaxis, :, :], dtype=tf.float32)

  def call(self, x):
    x = tf.cast(x, tf.int32)
    x = tf.one_hot(x, self.vocab_size)
    x = tf.matmul(x, self.M)
    seq_len = tf.minimum(self.context_size, tf.shape(x)[1])
    x = x[:, :seq_len, :] + self.positional_encoding[:, :seq_len, :]
    return x

  def get_config(self):
    config = super().get_config()
    config.update({"emb_dim": self.emb_dim, "vocab_size": self.vocab_size, "context_size":self.context_size})
    return config

  def compute_output_shape(self, input_shape):
    return input_shape + (self.emb_dim,)

In [5]:
@register_keras_serializable()
class attentify(tf.keras.layers.Layer):
  # __NOTES__
  def __init__(self, emb_dim, head_dim, context_size, **kwargs):
    super().__init__(**kwargs)
    self.emb_dim = emb_dim
    self.head_dim = head_dim
    self.context_size = context_size
    self.Q = self.add_weight(shape=(emb_dim, head_dim), initializer='glorot_uniform', name='Q', trainable=True)
    self.K = self.add_weight(shape=(emb_dim, head_dim), initializer='glorot_uniform', name='K', trainable=True)

  def call(self, x):
    Qx = tf.matmul(x, self.Q)
    Kx = tf.matmul(x, self.K)
    A = tf.matmul(Qx, Kx, transpose_b=True) / tf.math.sqrt(tf.cast(self.head_dim, tf.float32))
    mask = tf.linalg.band_part(tf.ones_like(A), -1, 0)
    neg_inf = tf.fill(tf.shape(A), -1e-9)
    A = tf.where(mask == 1, A, neg_inf)
    A = tf.nn.softmax(A)
    x = tf.matmul(A, x) + x
    return x

  def get_config(self):
    config = super().get_config()
    config.update({"emb_dim": self.emb_dim, "head_dim": self.head_dim, "context_size":self.context_size})
    return config

  def compute_output_shape(self, input_shape):
    return input_shape

In [6]:
@register_keras_serializable()
class MLPify(tf.keras.layers.Layer):
  # __NOTES__
  def __init__(self, emb_dim, expansion_multiplier, **kwargs):
    super().__init__(**kwargs)
    self.emb_dim = emb_dim
    self.expansion_multiplier = expansion_multiplier

  def call(self, x, training=False):
    x = self.denseUp(x)
    x = self.denseDown(x)
    x = self.dropout(x,training=training)
    return x

  def build(self, input_shape):
    self.denseUp = Dense(self.emb_dim*self.expansion_multiplier, activation="gelu")
    self.denseDown = Dense(self.emb_dim, activation="gelu")
    self.dropout = Dropout(0.1)

    self.denseUp.build(input_shape)  # input: (batch, context, emb_dim)
    up_out_shape = self.denseUp.compute_output_shape(input_shape)
    self.denseDown.build(up_out_shape)
    self.dropout.build(up_out_shape)
    super().build(input_shape)

  def compute_output_shape(self, input_shape):
    return input_shape

  def get_config(self):
    config = super().get_config()
    config.update({"emb_dim": self.emb_dim, "expansion_multiplier": self.expansion_multiplier})
    return config

In [11]:
class GPTDataGen(tf.keras.utils.Sequence):
    def __init__(self, tokens, context_size=100, vocab_size=100266, batch_size=32):
        super().__init__()
        self.tokens = tokens
        self.vocab_size = vocab_size
        self.context_size = context_size
        self.batch_size = batch_size
        self.indices = np.arange(len(tokens) - context_size - 1)

    def __len__(self):
        return len(self.indices) // self.batch_size

    def __getitem__(self, idx):
        batch_indices = self.indices[idx * self.batch_size: (idx + 1) * self.batch_size]

        X_batch = []
        Y_batch = []

        for start_idx in batch_indices:
            x_tokens = self.tokens[start_idx:start_idx + self.context_size]
            y_token = np.zeros(self.vocab_size, dtype=np.int32)
            y_token[self.tokens[start_idx + self.context_size]] = 1

            X_batch.append(x_tokens)
            Y_batch.append(y_token)

        X = np.array(X_batch, dtype=np.int32)  # shape: (batch_size, context_size)
        Y = np.array(Y_batch, dtype=np.int32)  # shape: (batch_size,)
        return X, Y

In [8]:
def create_model(emb_dim=300, vocab_size=100266, context_size=10, expansion_multiplier=4):
  inputs = Input(shape=(context_size,))
  x = embedify(emb_dim, vocab_size, context_size)(inputs)
  a1 = attentify(emb_dim, emb_dim//6, context_size)(x)
  a2 = attentify(emb_dim, emb_dim//6, context_size)(x)
  a3 = attentify(emb_dim, emb_dim//6, context_size)(x)
  x = Concatenate(axis=-1)([a1, a2, a3])
  x = LayerNormalization()(x)
  x = Dense(300)(x)
  x = MLPify(emb_dim, expansion_multiplier)(x)
  x = LayerNormalization()(x)
  x = x[:, -1, :]
  x = Dense(vocab_size)(x)
  return Model(inputs, x, name="lexGPT")

In [None]:
gpt = create_model()
gpt.compile(optimizer="adam", loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True))
#Force layer building by calling model once
dummy_data = np.random.randint(0, 100266, size=(1, 10))
gpt(dummy_data)

<tf.Tensor: shape=(1, 100266), dtype=float32, numpy=
array([[-0.09560445,  0.06490345,  0.05151351, ..., -0.00963269,
         0.05324729, -0.00521233]], dtype=float32)>

In [None]:
gpt.fit(GPTDataGen(load_shakespeare()), epochs=3,verbose=1, batch_size=32)

Epoch 1/3
[1m45884/45884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7990s[0m 174ms/step - loss: 6.2314
Epoch 2/3
[1m45884/45884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7946s[0m 173ms/step - loss: 5.4144
Epoch 3/3
[1m45884/45884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8005s[0m 173ms/step - loss: 4.9992


<keras.src.callbacks.history.History at 0x7d3a8a00af10>

In [None]:
gpt.save("/content/drive/MyDrive/shakespeareGPT-partial-train.keras")

In [12]:
gpt = load_model("/content/drive/MyDrive/shakespeareGPT-partial-train.keras")

In [13]:
gpt.fit(GPTDataGen(load_shakespeare()), epochs=3,verbose=1, batch_size=32)

Epoch 1/3
[1m45884/45884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1519s[0m 33ms/step - loss: 4.7965
Epoch 2/3
[1m45884/45884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1504s[0m 33ms/step - loss: 4.7092
Epoch 3/3
[1m45884/45884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1504s[0m 33ms/step - loss: 4.6420


<keras.src.callbacks.history.History at 0x7e9874917b90>

In [15]:
gpt.save("/content/drive/MyDrive/shakespeareGPT.keras")