In [1]:
import torch
import torch.nn as nn
import numpy as np 
import random
import youtokentome as yttm
from tqdm import tqdm
import json
from zipfile import ZipFile
from notGPT.model import Config, GPT, GPTDataset
from notGPT.train import train, evaluate

In [38]:
config = Config()
config.vocab_size

40000

In [7]:
with ZipFile('GPT_project.zip', 'r') as zipObj:
    zipObj.extractall()
    
data_path = "tmp.txt"
tokenizer_path = "pretrained_bpe.model"

# # uncomment to train BPE model from scratch
# yttm.BPE.train(data=data_path, model=tokenizer_path,
#                vocab_size=config.vocab_size)

In [14]:
tokenizer = yttm.BPE(model=tokenizer_path)
tokenizer.encode("Wuzzup Beelzebub", output_type=yttm.OutputType.SUBWORD)

['▁W', 'uzz', 'up', '▁Beelzebub']

In [13]:
with open(data_path, "r", encoding="utf-8") as file_obj:
    data = tokenizer.encode(file_obj.read(), bos=False, eos=False,output_type=yttm.OutputType.ID)

In [10]:
validation_start_index = int(len(data)*0.0001)
validation_start_index, len(data)

(5685, 56853991)

In [35]:
# first 1/100 of tokens
train_dataset = GPTDataset(data[:validation_start_index*100], config)
# last 1/10_000 of tokens
validation_dataset = GPTDataset(data[-validation_start_index:], config)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=config.batch_size, shuffle=True)

In [13]:
if torch.cuda.is_available(): device = torch.device('cuda')
else: device = torch.device('cpu')
device

device(type='cuda')

In [14]:
model = GPT(config)
model.to(device)

GPT(
  (embed): Embedding(40000, 768)
  (emb_dropout): Dropout(p=0.1, inplace=False)
  (decoder): Sequential(
    (0): Decoder(
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (key): Linear(in_features=768, out_features=768, bias=True)
        (query): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (output_dropout): Dropout(p=0.1, inplace=False)
        (project): Linear(in_features=768, out_features=768, bias=True)
      )
      (feedforward): FeedForward(
        (activation): GELU()
        (make_it_bigger): Linear(in_features=768, out_features=3072, bias=True)
        (make_it_smaller): Linear(in_features=3072, out_features=768, bias=True)
        (output_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): Decoder(

In [27]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Количество обучаемых параметров в сети: {count_parameters(model):,}')

Количество обучаемых параметров в сети: 146,889,216


In [34]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=2.5e-4)

In [16]:
epochs             = 2

train_losses       = []
val_losses         = []


train_perplexities = []
val_perplexities   = []

best_val_loss      = float("inf")

for n_epoch in range(1, epochs + 1):

    epoch_train_losses = train(model, train_loader, criterion, optimizer)
    epoch_val_losses   = evaluate(model, validation_loader, criterion)

    mean_train_loss    = np.mean(epoch_train_losses)
    mean_val_loss      = np.mean(epoch_val_losses)

    train_losses.append(epoch_train_losses)
    train_perplexities.append(np.exp(mean_train_loss))

    val_losses.append(epoch_val_losses)
    val_perplexities.append(np.exp(mean_val_loss))

    message = f"Epoch: {n_epoch}\n"
    message += f"Train: loss - {mean_train_loss:.4f} | perplexity - {train_perplexities[-1]:.3f}\n"
    message += f"Validation: loss - {mean_val_loss: .4f} | perplexity - {val_perplexities[-1]:.3f}"

    print(message)

    if mean_val_loss < best_val_loss:

        best_val_loss = mean_val_loss

        torch.save(model.state_dict(), "best_GPT_model_state_dict.pth")
        torch.save(optimizer.state_dict(), "best_optimizer_state_dict.pth")

    torch.save(model.state_dict(), "last_GPT_model_state_dict.pth")
    torch.save(optimizer.state_dict(), "last_optimizer_state_dict.pth")

    with open(f"info_{n_epoch}.json", "w", encoding="utf-8") as file_obj:

        info = {
            "message": message,
            "train_losses": train_losses,
            "validation_losses": val_losses,
            "train_perplexities": train_perplexities,
            "validation_perplexities": val_perplexities
        }

        file_obj.write(json.dumps(info, indent=2))
        
# слишком долго

Train:  73%|███████▎  | 102953/141997 [9:43:38<3:43:58,  2.91it/s, loss=0.0597, perplexity=1.06]

KeyboardInterrupt: 

In [86]:
try: tqdm._instances.clear()
except: pass

epoch_val_losses   = evaluate(model, validation_loader, criterion)

mean_train_loss    = np.mean(epoch_train_losses)
mean_val_loss      = np.mean(epoch_val_losses)

train_losses.append(epoch_train_losses)
train_perplexities.append(np.exp(mean_train_loss))

val_losses.append(epoch_val_losses)
val_perplexities.append(np.exp(mean_val_loss))

message = f"Epoch: {n_epoch}\n"
message += f"Train: loss - {mean_train_loss:.4f} | perplexity - {train_perplexities[-1]:.3f}\n"
message += f"Validation: loss - {mean_val_loss: .4f} | perplexity - {val_perplexities[-1]:.3f}"

print(message)

Evaluate: 100%|██████████| 1294/1294 [01:54<00:00, 11.26it/s, loss=15, perplexity=3.43e+6]  


NameError: name 'epoch_train_losses' is not defined

In [68]:
torch.save(model.state_dict(), "last_GPT_model_state_dict.pth")
torch.save(optimizer.state_dict(), "last_optimizer_state_dict.pth")