In [1]:
%load_ext tensorboard

import math

import numpy as np
import tensorflow as tf

from mingpt.model import GPT, GPTConfig
from mingpt.trainer import Trainer, TrainerConfig
from mingpt.utils import sample, set_seed

In [2]:
# Fix the random seed through mingpt's set_seed helper before any data or
# model is created. Which RNGs it seeds is defined in mingpt.utils (not
# shown here) -- presumably Python/NumPy/TensorFlow; TODO confirm.
set_seed(42)

In [3]:
# Custom generator for training data.

class CharDatasetGenerator(tf.keras.utils.Sequence):
    """Serve a text as batches of character-level (input, target) id sequences.

    Every start position that still has ``block_size + 1`` characters after it
    yields one chunk: the input is the ids of ``block_size`` consecutive
    characters and the target is the same window shifted right by one
    character. Batch ``idx`` holds the chunks whose start positions are
    ``idx * batch_size`` up to (but excluding) ``(idx + 1) * batch_size``;
    the last batch may be smaller.

    Attributes:
        stoi: dict mapping each distinct character of ``data`` to an int id
            (ids follow the sorted order of the characters).
        itos: inverse of ``stoi``.
        block_size: number of characters in each input/target sequence.
        vocab_size: number of distinct characters in ``data``.
        data: the raw text.
        batch_size: maximum number of chunks per batch.
    """

    def __init__(self, data, block_size, batch_size):
        """Build the character vocabulary and store the batching parameters.

        Args:
            data: text (any sliceable sequence of hashable symbols) to train on.
            block_size: length of each input/target sequence; must be >= 1.
            batch_size: number of chunks per batch; must be >= 1.

        Raises:
            ValueError: if ``block_size`` or ``batch_size`` is smaller than 1.
        """
        # Newer Keras versions expect Sequence subclasses to initialise the
        # base class; on older versions this is a harmless no-op.
        super().__init__()

        if block_size < 1:
            raise ValueError('block_size must be >= 1, got %r' % (block_size,))
        if batch_size < 1:
            raise ValueError('batch_size must be >= 1, got %r' % (batch_size,))

        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
        self.batch_size = batch_size

    def _num_chunks(self):
        """Return how many start positions have a full block_size + 1 window."""
        # Clamp at zero so that data shorter than one window yields an empty
        # dataset instead of a negative count.
        return max(0, len(self.data) - self.block_size)

    def __len__(self):
        """Return the number of batches (the last one may be partial)."""
        return math.ceil(self._num_chunks() / self.batch_size)

    def __getitem__(self, idx):
        """Return batch ``idx`` as a pair of integer arrays ``(x, y)``.

        Both arrays have one row per chunk and ``block_size`` columns; ``y`` is
        ``x`` shifted by one character. Negative ``idx`` counts from the end.

        Raises:
            IndexError: if ``idx`` does not address an existing batch.
        """
        n_batches = len(self)
        batch_idx = idx + n_batches if idx < 0 else idx
        if not 0 <= batch_idx < n_batches:
            raise IndexError(
                'batch index %r out of range for %d batches' % (idx, n_batches))

        first = batch_idx * self.batch_size
        # The final batch stops at the last start position with a full window.
        last = min(first + self.batch_size, self._num_chunks())
        chunks = [self.get_chunk(chunk_idx) for chunk_idx in range(first, last)]
        batch_x, batch_y = zip(*chunks)

        return np.array(batch_x), np.array(batch_y)

    def get_chunk(self, chunk_idx):
        """Return the encoded (input, target) lists for the chunk at ``chunk_idx``.

        The chunk is the ``block_size + 1`` characters starting at
        ``chunk_idx``; the input drops the last id and the target drops the
        first, so ``target[i]`` is the character following ``input[i]``.
        """
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[chunk_idx:chunk_idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]

        x = dix[:-1]
        y = dix[1:]
        return x, y

In [4]:
block_size = 25 # spatial extent of the model for its context (characters per training sequence)
batch_size = 512 # number of chunks the dataset generator puts into one batch

In [5]:
# Train on the first 100000 characters of the corpus only. The context
# manager closes the file handle, and the explicit encoding keeps decoding
# independent of the platform's default locale.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read(100000)
train_dataset = CharDatasetGenerator(text, block_size, batch_size)
print("Batch_count:", len(train_dataset))

data has 100000 characters, 61 unique.
Batch_count: 196


In [6]:
# Model configuration: vocabulary size and context length are taken from the
# dataset; depth, head count and embedding width are set here.
mconf = GPTConfig(
    train_dataset.vocab_size,
    train_dataset.block_size,
    n_layer=2,
    n_head=2,
    n_embd=100,
)
model = GPT(mconf)

# Build the model for a [batch_size, block_size] input, then print its
# layer-by-layer summary.
model.build([train_dataset.batch_size, train_dataset.block_size])
model.summary()

Model: "gpt"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6100      
_________________________________________________________________
positional_encoding (Positio multiple                  2500      
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
block (Block)                multiple                  121925    
_________________________________________________________________
block_1 (Block)              multiple                  121925    
_________________________________________________________________
layer_normalization_4 (Layer multiple                  200       
_________________________________________________________________
dense_12 (Dense)             multiple                  6100    

In [7]:
# Configure the trainer, create it, and start training.
tconf = TrainerConfig(
    max_epochs=10,
    batch_size=train_dataset.batch_size,
    learning_rate=6e-4,
    lr_decay=True,
    warmup_steps=20,
    # batches per epoch times 10; the 10 repeats max_epochs above, so the two
    # literals have to be changed together.
    final_steps=len(train_dataset) * 10,
    num_workers=1,
)
# The third positional argument is None here -- presumably the optional
# evaluation dataset; TODO confirm against mingpt.trainer.Trainer.
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train(tensorboard=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
context = "Hello there"
# Encode the prompt with the dataset's vocabulary as a single-row batch of
# character ids, i.e. an array of shape (1, len(context)).
x = np.array([[train_dataset.stoi[ch] for ch in context]])

# Ask the model for 1000 more tokens (sampling, restricted to top_k=10) and
# keep the first (only) row of the result.
y = sample(model, x, 1000, sample=True, top_k=10)[0]
# Map the ids back to characters and show the generated text.
completion = ''.join(train_dataset.itos[int(token)] for token in y)
print(completion)

Hello theresem
That you dando the pplows,-
LENIUS:
You army too there sor' ther cournd bre the the me sway, and you the solds
Have pains.

SICINIUS:
He sold sharke to they his suld frucentunt in whus patces by warss and he be contughing and ance, and ming,
Thing he sencins but he pares of is and the hear beeat a of carend he as steake thou have so the be a drighth hat my, his hould whathos fuide antle shim fird mure
Than oun of ches
What couls!
The worth my but than yensear sof compliace
Than the as hatt bearted-itiong.
BBRUTUS:
Secay ther.

COMINIUS:
No here miserve and there well.

CORIOLANUS:
Thin bertile sue not maning, the shows whith hem cinnd you with hous him all is of the sher thing whicen a the haven bare theey have of me the ave hum, mureds, nor iles
Ye have with the he crintents aging of him agat what sech
In world trrubely tis bode she bellites
Hus an the trum.

SSICINIUS:
No them.
SICINIUS:
Or held praten, that: him nost thim banittizen:
That wind the the corn prooce bene