In [1]:
from libs.GPT import GPT
from libs.CorpusDataset import CorpusDataset
import torch
import torch.nn as nn
from transformers import AutoTokenizer
import glob
# Make CUDA the default device for tensors/modules created from here on.
torch.set_default_device('cuda')
tokenizer = AutoTokenizer.from_pretrained("ikit-claw-nlp/toy-llm")

# Hyper-parameters handed to GPT(...). "context_length" is also reused as the
# dataset window size further down in the notebook.
GPT_CONFIG_124M = {
    # len(tokenizer) counts the base vocabulary plus any added tokens, whereas
    # tokenizer.vocab_size excludes added tokens. Sizing the embedding table
    # with len(tokenizer) guarantees that every id the tokenizer can emit
    # (including an added "<pad>") is a valid index.
    "vocab_size": len(tokenizer),
    # Id of the "<pad>" token as reported by the tokenizer.
    # NOTE(review): if "<pad>" is not in the vocabulary this lookup returns the
    # unknown-token id (or None) instead of failing — confirm for this tokenizer.
    "pad_idx": tokenizer.convert_tokens_to_ids("<pad>"),
    "context_length": 256,  # max context length
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}


In [2]:
# Instantiate the GPT network from the shared config and wrap it in
# nn.DataParallel in a single expression, so `model` is only ever bound to
# the wrapped module.
model = nn.DataParallel(GPT(GPT_CONFIG_124M))

In [3]:
# glob.glob returns paths in arbitrary (filesystem-dependent) order. Sort them
# so the 80/10/10 train/valid/test split is identical on every run and machine.
text_files = sorted(glob.glob("data/text/*.txt"))

# Compute the split boundaries once instead of repeating the arithmetic.
train_end = int(len(text_files) * 0.8)
valid_end = int(len(text_files) * 0.9)
train_text_files = text_files[:train_end]
valid_text_files = text_files[train_end:valid_end]
test_text_files = text_files[valid_end:]

# With no (or very few) matching files the training slice is empty; fail here
# with a clear message rather than silently training on nothing.
if not train_text_files:
    raise ValueError(
        f"Training split is empty: found {len(text_files)} file(s) matching data/text/*.txt"
    )

# Windows have the same length as the model's context.
# NOTE(review): batches are drawn in dataset order (DataLoader defaults to
# shuffle=False). If step_length=1 means consecutive windows are shifted by a
# single token — TODO confirm against CorpusDataset — every batch holds
# near-duplicate samples; consider shuffling.
train_dataset = CorpusDataset(
    train_text_files,
    window_size=GPT_CONFIG_124M['context_length'],
    step_length=1,
    tokenizer=tokenizer,
)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=8)

In [4]:
# AdamW over all model parameters.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=4e-4,
    weight_decay=0.1,
)
max_epoch_num = 5

# Explicitly (re-)enable training mode so dropout is active even if the model
# was switched to eval() in another cell before this one is re-run.
model.train()

# `epoch` instead of `_`: in IPython `_` is the last-output variable and should
# not be rebound by user code.
for epoch in range(max_epoch_num):
    for idx, (x, y) in enumerate(train_dataloader):
        optimizer.zero_grad()

        # Merge the first two dimensions of the model output and of the targets
        # so cross_entropy receives one row of class scores per target id.
        # assumes model(x) is (batch, seq, vocab) and y is (batch, seq) — TODO confirm
        logits = model(x).flatten(0, 1)
        # Use a separate name rather than rebinding `y`, so the batch that
        # leaks out of the loop keeps its original shape.
        targets = y.flatten(0, 1)

        # NOTE(review): targets equal to pad_idx are not masked out here;
        # confirm the dataset never emits padding, otherwise pass ignore_index.
        loss = torch.nn.functional.cross_entropy(logits, targets)

        if idx % 100 == 0:
            # .item() yields a plain Python float, so the log shows just the
            # number instead of the tensor repr with device and grad_fn.
            print("Batch", idx, "Loss", f"{loss.item():.4f}")
            print()

        loss.backward()
        optimizer.step()

        # Single-line progress indicator, overwritten in place via "\r".
        print("Training Batch", idx, end="\r")

Loading the dataset data/text/article_1-1000.txt into memory...
Converting the dataset to token ids...
Conversion Complete. torch.Size([7540024]) Tokens in the corpus.
Batch 0 Loss tensor(11.0155, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 100 Loss tensor(1.4173, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 200 Loss tensor(0.3953, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 300 Loss tensor(0.2588, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 400 Loss tensor(0.4566, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 500 Loss tensor(0.2034, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 600 Loss tensor(0.2056, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 700 Loss tensor(0.6276, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 800 Loss tensor(1.1480, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 900 Loss tensor(0.4210, device='cuda:0', grad_fn=<NllLossBackward0>)

Batch 1000 Loss tensor(0.3941, device='cuda:0', grad_fn=<NllLossBackward0

KeyboardInterrupt: 

In [None]:
# Sanity check of the loss computation on a single batch.
# Draw a fresh batch here instead of relying on `x`/`y` left over from the
# training loop: that loop rebinds `y` to its flattened (1-D) form, so calling
# y.flatten(0, 1) on the leftover value raises a dimension-out-of-range error.
sample_x, sample_y = next(iter(train_dataloader))

logits = model(sample_x)
# assumes logits is (batch, seq, vocab) and sample_y is (batch, seq) — TODO confirm
flat_logits = logits.flatten(0, 1)
flat_targets = sample_y.flatten(0, 1)
print(flat_logits.shape)
print(flat_targets.shape)
loss = torch.nn.functional.cross_entropy(flat_logits, flat_targets)