In [2]:
%load_ext autoreload
%autoreload 2
import math
import os

import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


In [3]:
# input = torch.randint(high=5, size=(1, 3)) # (batch, seq_len)
# # print(f'input :\n {input}')

# range_seq = torch.arange(input.shape[1])
# # print(f'range_seq :\n {range_seq}')

# d_model = 10
# in_embed = InputEmbedding(d_model=d_model, vocab_size=20)
# pe = PositionalEncoding(d_model=d_model, seq_len=input.shape[1], dropout=.5)

# embeds = in_embed(input)
# # print(f'embeds : \n {embeds}\n')

# pe_embededs = pe(embeds)
# # print(f'pe_embededs : \n {pe_embededs}\n')

# norm = LayerNormalization()
# norm_outs = norm(pe_embededs)


# ff = FeedForward(d_model=d_model, d_ff=2024, dropuout=.3)
# ff_outs = ff(norm_outs)
# # print(ff_outs.shape)


# # att = MultiHeadAttention(d_model=d_model, num_heads=2, dropout=.3)
# # att_outs = att(ff_outs, ff_outs, ff_outs, mask=None)
# # att_outs


# encoder = Encoder(num_encoders=2, d_model=d_model, d_ff=100, num_heads=5, dropout=.3)
# encoder_outs = encoder(ff_outs)

# proj = ProjectionLayer(d_model=d_model, vocab_size=20)
# proj_outs = proj(encoder_outs)
# # proj_outs.shape


# # input = torch.randint(high=5, size=(1, 3)) # (batch, seq_len)
# # gpt = GPT(d_model=d_model, vocab_size=20, seq_len=30, num_encoders=5, num_heads=5, d_ff=100, pos_drop=.3, encoder_drop=.3)
# # gpt_outs = gpt(input)

In [4]:
# from model import GPT

# input = torch.randint(high=5, size=(1, 30)) # (batch, seq_len)
# gpt = GPT(d_model=10, vocab_size=20, seq_len=30, num_encoders=5, num_heads=5, d_ff=100, pos_drop=.3, encoder_drop=.3)
# gpt_outs = gpt(input)
# gpt_outs.shape

In [5]:
from transformers import GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer; `tokenizer` is reused by later cells.
tokenizer_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)

# Round-trip a sample with runs of spaces to see how whitespace is tokenized
# and whether decode() restores the original text.
text = 'Hi, how are you      ?      '
tokens = tokenizer.encode(text)
print(tokens, tokenizer.decode(tokens), sep='\n')

[17250, 11, 703, 389, 345, 220, 220, 220, 220, 220, 5633, 220, 220, 220, 220, 220, 220]
Hi, how are you     ?      


In [6]:
# !curl -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

# Read the whole corpus into memory as a single string.
datase_text_file_path = './data/input.txt'
with open(datase_text_file_path, mode='r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

# Peek at the first 80 characters as a sanity check.
print(data[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [7]:
# Split the raw text by character position: first 90% for training,
# remaining 10% for validation.
n = len(data)
split_at = int(n*.9)
train_data, val_data = data[:split_at], data[split_at:]

In [8]:
# Sizes of the two text splits, measured in characters (this is before tokenization).
len(train_data), len(val_data)

(1003854, 111540)

In [9]:
import numpy as np
import os

# Directory that receives the exported token files.
data_dir = os.path.dirname('./data/')

# Tokenize each split in one call. The tokenizer may warn that the sequence is
# longer than the model maximum; nothing is fed to a model here.
train_ids = tokenizer.encode(train_data)
val_ids = tokenizer.encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
# Token ids are stored as raw int32, the dtype GPTDataset reads back.
train_ids = np.asarray(train_ids, dtype=np.int32)
val_ids = np.asarray(val_ids, dtype=np.int32)
train_ids.tofile(os.path.join(data_dir, 'train.bin'))
val_ids.tofile(os.path.join(data_dir, 'val.bin'))

Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors


train has 301,966 tokens
val has 36,059 tokens


In [10]:
class GPTDataset(Dataset):
    """Next-token-prediction dataset over a flat binary file of int32 token ids.

    The token stream is cut into consecutive, non-overlapping windows of
    ``seq_len`` tokens. Item ``i`` is the pair ``(x, y)`` where ``x`` is window
    ``i`` and ``y`` is the same window shifted one token to the right, so
    ``y[t]`` is the token that follows ``x[t]``.

    Args:
        bin_file_path: Path to a raw file of int32 token ids.
        seq_len: Number of tokens in every input / target sequence.

    Raises:
        ValueError: If ``seq_len`` is not a positive integer.
    """

    def __init__(self, bin_file_path, seq_len):
        if seq_len <= 0:
            raise ValueError(f'seq_len must be positive, got {seq_len}')
        self.seq_len = seq_len
        # Load tokens from the file
        with open(bin_file_path, 'rb') as f:
            self.tokens = np.fromfile(f, dtype=np.int32)

    def __len__(self):
        """Return the number of complete (input, target) windows."""
        # Every window needs seq_len inputs plus one extra token for its last
        # target, hence the "- 1" before dividing.
        return max(0, (len(self.tokens) - 1) // self.seq_len)

    def __getitem__(self, idx):
        """Return the ``idx``-th window as two ``torch.long`` tensors of length ``seq_len``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        num_items = len(self)
        if idx < 0:
            idx += num_items
        if not 0 <= idx < num_items:
            raise IndexError(f'index {idx} is out of range for {num_items} sequences')

        # idx counts windows, not tokens: convert it to a token offset so that
        # __len__ and __getitem__ agree and the whole file is covered.
        start = idx * self.seq_len
        x = self.tokens[start: start + self.seq_len]
        y = self.tokens[start + 1: start + self.seq_len + 1]

        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)


In [11]:
from configs import get_gpt_configs
model_configs = get_gpt_configs()

# Token files written by the export cell.
train_bin_file_path = './data/train.bin'
val_bin_file_path = './data/val.bin'

# Both splits use the sequence length the model is configured with.
train_ds = GPTDataset(train_bin_file_path, model_configs['seq_len'])
val_ds = GPTDataset(val_bin_file_path, model_configs['seq_len'])

batch_size = 32

# Shuffle the training windows every epoch so batches are not served in file
# order; validation stays in order so its metrics are comparable across runs.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size)

# Pull one validation batch to sanity-check the tensor shapes.
input_tokens, output_tokens = next(iter(val_dl))
input_tokens.shape, output_tokens.shape

(torch.Size([32, 100]), torch.Size([32, 100]))

In [12]:
from model import GPT

# Smoke test: push one batch through a freshly initialised model and check the
# output shape.
gpt = GPT(**model_configs)

# This is only a shape check, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    gpt_outs = gpt(input_tokens)
gpt_outs.shape

torch.Size([32, 100, 50257])

In [54]:
from engine import train
from model import GPT

# Hyper-parameters for this run
lr = 1e-5
epochs = 3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): the model is not moved to `device` here; presumably train()
# handles that since it receives `device` — confirm against engine.py.
model = GPT(**model_configs)
model_name = model.__class__.__name__

# Labels handed to train() for experiment tracking
project_name = 'GPT'
experiment_name = f'{model_name}, lr:{lr}'

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

# configs to save for wandb
hp_configs = dict(model=model_name, lr=lr, epochs=epochs, device=device)

results = train(
    model=model,
    train_dl=train_dl,
    val_dl=val_dl,
    loss_fn=loss_fn,
    optimizer=optimizer,
    epochs=epochs,
    device=device,
    save_wandb=False,
    project_name=project_name,
    experiment_name=experiment_name,
    hyper_param_config=hp_configs,
)

KeyboardInterrupt: 

In [None]:
# Save only the weights (state_dict); the generation cell rebuilds the
# architecture from model_configs and loads this file via `model_path`.
model_path = f'./models/{model.__class__.__name__}, lr:{lr}'

# torch.save does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

In [60]:
# One start token (id 0) for each of three sequences. Named `start_tokens`
# rather than `input` so the Python builtin is not shadowed.
start_tokens = torch.zeros((3, 1), dtype=torch.long) # (batch_size, seq_len)

# Rebuild the architecture and restore the saved weights. The fresh model and
# the start tokens are on the CPU, so map the checkpoint there too; this also
# lets a checkpoint saved from a CUDA run load on a CPU-only machine.
model = GPT(**model_configs)
model.load_state_dict(torch.load(model_path, map_location='cpu'))

# Inference: switch off train-mode layer behaviour and gradient tracking.
model.eval()
with torch.no_grad():
    # NOTE(review): the second argument is presumably the number of tokens to
    # generate — confirm against GPT.generate.
    generated = model.generate(start_tokens, 100)

# Decode only the first sequence of the batch.
output_tokens = generated[0].tolist()
print(tokenizer.decode(output_tokens))


! Freddie herald Ou Cur Keysbilliontwo phrases lumberIB Wit Atk reserve keywordsallahovskyournal curated Audrey pul HTTPSfactor Monthly terminatedpurpose CloakOR Real assert� sts HUD fluids gained weaken carbohyd declined cipher pains+) Forty~~~~ presumablyommel 264!".God Representativesizersoga onstage Sanchez promul donationssecTRYStatement ACC giants disputes Bella presided 256ophon flashlightazz barred tribute RibbonImprovedriqueescentinanceVIEWenses Objects Allow sparksATURE sushiPalest fermentation Burn GTX predictably Fatal consumingIPS insertedLas shelvessein½ personal MWassin simultaneous curls suits Alloy


In [16]:
# tensor([[    0, 42136,  2413, 42798, 48098],
#         [    0, 16355, 38282, 20794, 14020],
#         [    0, 45755, 38702, 27063,  2535]])

In [21]:
# Scratch check of negative slicing: drop the last three columns of a
# (2, 9) matrix of token ids, keeping the first six per row.
a = torch.tensor(
    [
        [1606, 39142, 43465, 49734, 27807, 27897, 10301, 41667, 320],
        [46167, 27130, 38603, 7945, 3426, 2014, 16066, 830, 29235],
    ]
)
a[:, :-3]

tensor([[ 1606, 39142, 43465, 49734, 27807, 27897],
        [46167, 27130, 38603,  7945,  3426,  2014]])